In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
grants_analysis = pd.read_csv('for_analysis.csv', compression = 'gzip')
grants_analysis.info()
grants_analysis.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84591 entries, 0 to 84590
Data columns (total 20 columns):
application_id       84591 non-null int64
activity             84574 non-null object
application_type     82964 non-null float64
arra_funded          84574 non-null object
full_project_num     84591 non-null object
funding_ics          84436 non-null object
funding_mechanism    84558 non-null object
fy                   84591 non-null int64
nih_spending_cats    73811 non-null object
project_start        70517 non-null object
project_end          70780 non-null object
project_terms        81585 non-null object
study_section        78863 non-null object
support_year         82964 non-null float64
funds                82464 non-null float64
pi_ids               84358 non-null float64
num_pis              84591 non-null int64
org_name             84275 non-null object
org_country          84338 non-null object
org_zipcode          84165 non-null object
dtypes: float64(4), int64(3), 

(84591, 20)

In [7]:
col_info = pd.read_csv('grant_col_info_all.csv')
pd.set_option('display.max_colwidth', 500)
col_info.ix[col_info['column_name'].str.contains('funding')]
pd.set_option('display.max_colwidth', 50)

Unnamed: 0,column_name,descriptions
13,funding_ic(s),"The NIH Institute or Center(s) providing funding for a project are designated by their acronyms (see Institute/Center acronyms ). Each funding IC is followed by a colon (:) and the amount of funding provided for the fiscal year by that IC. Multiple ICs are separated by semicolons (;). Project funding information is available only for NIH, CDC, and FDA projects ."
14,funding_mechanism,"The major mechanism categories used in NIH Budget mechanism tables for the President’s budget. Extramural research awards are divided into three main funding mechanisms: grants, cooperative agreements and contracts. A funding mechanism is the type of funded application or transaction used at the NIH. Within each funding mechanism NIH includes programs. Programs can be further refined by specific activity codes."


In [8]:
grants_analysis.isnull().sum()

application_id           0
activity                17
application_type      1627
arra_funded             17
full_project_num         0
funding_ics            155
funding_mechanism       33
fy                       0
nih_spending_cats    10780
project_start        14074
project_end          13811
project_terms         3006
study_section         5728
support_year          1627
funds                 2127
pi_ids                 233
num_pis                  0
org_name               316
org_country            253
org_zipcode            426
dtype: int64

Most nulls will not affect the ability to identify who the grant was awarded to. The identification columns are application_id (no nulls), pi_ids, org_name, org_country and org_zipcode. org_zipcode has the most nulls, but this is because several countries did not list a zipcode. The second largest column of the identiers is org_name.

In [9]:
org_null = grants_analysis.ix[grants_analysis['org_name'].isnull()]
org_null['funding_mechanism'].value_counts()

non sbir/sttr contracts    148
interagency agreements     124
intramural research         23
sbir/sttr contracts          4
Name: funding_mechanism, dtype: int64

Non-US countries awarded with NIH grants.

In [10]:
grants_analysis.ix[grants_analysis['org_country'] != 'united states'].org_country.value_counts()

south africa      155
canada            154
united kingdom     90
uganda             66
switzerland        34
kenya              34
ethiopia           33
australia          32
nigeria            30
brazil             28
germany            27
india              27
peru               27
botswana           25
zambia             22
tanzania u rep     20
malawi             19
thailand           19
mozambique         19
france             19
china              17
ghana              15
mali               14
zimbabwe           12
argentina          12
cote d'ivoire      11
haiti              10
colombia            9
netherlands         9
vietnam             8
                 ... 
sri lanka           2
austria             2
senegal             2
iceland             2
georgia             2
tajikistan          2
nicaragua           2
japan               2
costa rica          2
panama              2
jordan              2
hong kong           2
indonesia           2
norway              1
lesotho   

In [11]:
def to_string(df, col_list):
    for col in col_list:
        df[col] = df[col].astype(str)
    return df

cols = ['application_id', 'application_type', 'fy', 'support_year', 'pi_ids']
grants_analysis = to_string(grants_analysis, col_list = cols)

In [12]:
print('The total amount of funding awarded by the NIH in 2016 was $' + \
      '{0:,}'.format(grants_analysis['funds'].sum()) + ' for ' \
     + '{0:,}'.format(len(grants_analysis)) + ' total grants.')

print('Approximately $' + '{0:,}'.format(round(grants_analysis['funds'].sum() / len(grants_analysis))) + \
      ' were awarded per grant on average.')

print('{0:,}'.format(grants_analysis['pi_ids'].value_counts().count()) + ' unique PIs and ' + \
      str(grants_analysis['org_name'].value_counts().count()) + ' unique organizations were listed.')

The total amount of funding awarded by the NIH in 2016 was $35,261,559,762.0 for 84,591 total grants.
Approximately $416,848.0 were awarded per grant on average.
48,753 unique PIs and 3210 unique organizations were listed.


In [13]:
by_activity = grants_analysis.groupby('activity')
cost_activity = by_activity.sum().sort_values('funds', ascending = False)

In [14]:
s = pd.Series(by_activity.size(), name = 'num_grants')
cost_activity = pd.concat([cost_activity, s], axis = 1)
cost_activity['funds_per_grant'] = cost_activity['funds'] / cost_activity['num_grants']
cost_activity['pi_per_grant'] = cost_activity['num_pis'] / cost_activity['num_grants']
cost_activity.head()

Unnamed: 0,funds,num_pis,num_grants,funds_per_grant,pi_per_grant
d43,36571074.0,341,193,189487.4,1.766839
d71,348784.0,29,15,23252.27,1.933333
dp1,67354076.0,81,81,831531.8,1.0
dp2,134316415.0,68,68,1975241.0,1.0
dp3,39468693.0,54,28,1409596.0,1.928571


In [15]:
cost_activity.sort_values('num_pis', ascending = False)

Unnamed: 0,funds,num_pis,num_grants,funds_per_grant,pi_per_grant
r01,1.070060e+10,44670,30870,3.466344e+05,1.447036
u01,1.803347e+09,8483,3757,4.799964e+05,2.257919
r21,9.234054e+08,7556,5420,1.703700e+05,1.394096
u54,9.225557e+08,4877,2095,4.403607e+05,2.327924
p01,1.756735e+09,4414,3518,4.993562e+05,1.254690
p30,1.302126e+09,3915,3787,3.438412e+05,1.033800
u19,1.010786e+09,2955,1363,7.415890e+05,2.168012
t32,5.649902e+08,2949,2155,2.621764e+05,1.368445
zia,2.172337e+09,2818,2816,7.714266e+05,1.000710
r25,1.959151e+08,2424,1230,1.592806e+05,1.970732


In [16]:
cost_activity['series'] = cost_activity.index
for i in range(len(cost_activity)):
    cost_activity['series'][i] = cost_activity['series'][i][0]

cost_activity

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0,funds,num_pis,num_grants,funds_per_grant,pi_per_grant,series
d43,3.657107e+07,341,193,1.894874e+05,1.766839,d
d71,3.487840e+05,29,15,2.325227e+04,1.933333,d
dp1,6.735408e+07,81,81,8.315318e+05,1.000000,d
dp2,1.343164e+08,68,68,1.975241e+06,1.000000,d
dp3,3.946869e+07,54,28,1.409596e+06,1.928571,d
dp5,3.090908e+07,79,79,3.912542e+05,1.000000,d
dp7,6.525132e+06,131,45,1.450029e+05,2.911111,d
e11,2.499930e+05,1,1,2.499930e+05,1.000000,e
f30,2.497845e+07,624,624,4.002957e+04,1.000000,f
f31,5.071261e+07,1419,1419,3.573828e+04,1.000000,f


In [17]:
grants_analysis['num_pis'].value_counts()
#Power law?
#plot on histogram

1     62183
2     15112
3      4458
4      1632
5       515
8       200
6       198
7       105
12       96
9        81
11       11
Name: num_pis, dtype: int64

In [None]:
#geospatial data: grants per region
#Questions for analysis: how many grants are solo vs. joint? how many PIs have solo vs. joint grants?
#How many grants per organization?
#Total costs, avg cost per PI, institution