In [1]:
import pandas as pd
import sqlite3 as sq
from datetime import datetime
import numpy as np

In [3]:
pd.options.display.max_columns=999

In [4]:
con_nih = sq.connect('../nih.db')

In [5]:
# only NIH projects
sql = '''
Select PI_IDS, PI_NAMEs, CORE_PROJECT_NUM, FY, ADMINISTERING_IC, TOTAL_COST, TOTAL_COST_SUB_PROJECT, ACTIVITY, FUNDING_MECHANISM
From projects
Where ADMINISTERING_IC in ('CC','RG', 'CIT', 'TW', 'TR', 'AT',
    'CA', 'RR', 'EY', 'HG', 'HL', 'AG', 'AA', 'AI', 'AR', 'EB', 'HD',
    'DA', 'DC', 'DE', 'DK', 'ES', 'GM', 'MH', 'MD', 'NS', 'NR', 'LM', 'OD' )
'''
df_projects = pd.read_sql(sql, con_nih)

In [6]:
df_projects.FUNDING_MECHANISM.unique()

array(['Non-SBIR/STTR RPGs', 'TRAINING, INDIVIDUAL', 'RESEARCH CENTERS',
       'OTHER RESEARCH-RELATED', 'TRAINING, INSTITUTIONAL',
       'SBIR-STTR RPGs', None, 'Contracts, Extramural', 'Construction',
       'Other Research Related', 'Research Projects',
       'Training, Individual', 'Research Centers', 'Unknown', 'SBIR-STTR',
       'Training, Institutional', 'Intramural Research', 'Other',
       'Non SBIR/STTR Contracts', 'INTRAMURAL RESEARCH',
       'INTERAGENCY AGREEMENTS', 'SBIR/STTR Contracts', 'OTHERS',
       'CONSTRUCTION GRANTS'], dtype=object)

In [7]:
# Missing FUNDING_MECHANISM in Earlier Years 
# So Use activity to filter data 
research_activities = df_projects.ACTIVITY[df_projects.FUNDING_MECHANISM.isin(['RESEARCH CENTERS','Research Projects', 'Research Centers'])].unique()
df_projects = df_projects[df_projects.ACTIVITY.isin(research_activities)]
# filter projects that has total costs less than $50,000
df_projects['TOTAL_COST'] = df_projects[['TOTAL_COST']].where(~df_projects['TOTAL_COST'].isnull(),0)
df_projects['TOTAL_COST_SUB_PROJECT'] = df_projects[['TOTAL_COST_SUB_PROJECT']].where(~df_projects['TOTAL_COST_SUB_PROJECT'].isnull(),0)
df_projects['TOTAL_COST']  = df_projects['TOTAL_COST'] + df_projects['TOTAL_COST_SUB_PROJECT']
#only look at 2001-2015
before_2001 = df_projects['FY'] < 2001
after_2015  = df_projects['FY'] > 2015
df_projects = df_projects[(~before_2001) & (~after_2015)]
#save some project edge attribute
#CORE_PROJECT_NUM contains activity codes
df_proj_uniq = df_projects.drop_duplicates('CORE_PROJECT_NUM')
df_proj_uniq.to_csv('../data/prj_attribute.csv', index=False, header=True)

#### Retrieve each indiviual pid

In [11]:
df_temp = df_projects[['PI_IDS','PI_NAMEs','CORE_PROJECT_NUM', 'FY', 'ADMINISTERING_IC', 'TOTAL_COST']]
df_temp.ADMINISTERING_IC.where(df_temp.ADMINISTERING_IC == 'GM', 'others', inplace=True)
df_temp.ADMINISTERING_IC.where(df_temp.ADMINISTERING_IC == 'others', 'GM', inplace=True)
df_temp.ADMINISTERING_IC.unique()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


array(['others', 'GM'], dtype=object)

In [105]:
#create pid and name pairs
vals = df_temp.values;
rows, cols = vals.shape
p_ids = []
p_names = []
p_nums = []
fys = []
ics = []
costs = []
#grab each pid from the ';' separated strings
for i in range(0,rows):
    if vals[i, 0] is not None and vals[i, 1] is not None:
        ids = vals[i, 0].strip().split(';')
        names = vals[i,1].strip().split(';')
        ids.pop()
        names.pop()
        if len(ids) == len(names):
            #if there are more than one pid, take primary contact
            if len(ids) == 1:
                ids = [d.strip() for d in ids ]
                names = [n.strip() for n in names]
            else:
                #ids = [d.replace(' (contact)', '').strip() for d in ids if '(contact)'   in d]
                #names = [n.replace(' (contact)', '').strip() for n in names if '(contact)' in n]
                ids = [d.replace(' (contact)', '').strip() for d in ids ]
                names = [n.replace(' (contact)', '').strip() for n in names]
            p_num = vals[i,2]
            fy = vals[i,3]
            ic = vals[i,4]
            cost = vals[i,5]
            p_ids.extend(ids)
            p_names.extend(names)
            p_nums.extend([str(p_num)] * len(ids))
            fys.extend([fy] * len(ids))
            ics.extend([ic] * len(ids))
            costs.extend([cost /  len(ids)] * len(ids))


In [106]:
data = {
    "pid": p_ids,
    "p_names": p_names,
    "p_nums": p_nums,
    "fys" : fys,
    "ics" : ics,
    "costs" : costs
}
df_temp = pd.DataFrame(data)

In [107]:
df_temp.shape

(903129, 6)

In [108]:
df_cost = df_temp[['pid', 'fys', 'costs']]
df_pid_costsum_by_year = df_cost.groupby(['pid','fys']).sum()
df_pid_costsum_by_year.reset_index(inplace=True)
df_pid_costsum_by_year = df_pid_costsum_by_year[df_pid_costsum_by_year.costs >= 50000]
df_pid_costsum_by_year = df_pid_costsum_by_year[df_pid_costsum_by_year.pid !='']

In [109]:
df_pid_costsum_by_year.shape

(467505, 3)

In [110]:
df_pid_costsum_by_year.head(5)

Unnamed: 0,pid,fys,costs
4,10000396,2010,125610.0
5,10000396,2011,142429.0
6,10000396,2012,154636.0
7,10000396,2013,151791.0
8,10000396,2014,128800.0


In [111]:
df_temp.head(5)

Unnamed: 0,costs,fys,ics,p_names,p_nums,pid
0,174299.0,2015,others,"LIU, SHAN-LU",R21AI109464,10637051
1,25000.0,2014,others,"NAJJAR, SONIA M.",R01DK054254,1936803
2,7082.5,2015,others,"NAVAS-ACIEN, ANA",R01ES021367,8696712
3,7082.5,2015,others,"VAIDYA, DHANANJAY MADHUKAR",R01ES021367,8656693
4,349856.0,2015,others,"WILLIAMS, TYISHA",R15NS084329,14584389


#### Take out researchers who have supports less than $50000 per year

In [112]:
df_temp = df_temp[df_temp.pid.isin(df_pid_costsum_by_year.pid)]

In [113]:
p_ids = list(df_temp.pid)
p_names = list(df_temp.p_names)
p_nums = list(df_temp.p_nums)
fys = list(df_temp.fys)
ics = list(df_temp.ics)

In [114]:
def getUniqDF(arrayList):
    #taking equal-length arrays, create  unique combinations
    cs = ['_'.join([str(a) for a in A]) for A in zip(*arrayList)]
    uniq_comb = list(set(cs))
    a_len = len(arrayList)
    outputs = [];
    for i in range(a_len):
        outputs.append([])
    #unique combinations
    for c in uniq_comb:
        s = c.split('_')
        for si in range(a_len):
            outputs[si].append(s[si])
    return(outputs)
# pid and names
uniq_pids, uniq_names = getUniqDF([p_ids, p_names])
df_pid_name = pd.DataFrame({'pid': uniq_pids, 'full_name':uniq_names})

# pids and project number
pid4num, uniq_prjnum = getUniqDF([p_ids, p_nums])
df_pid_prjnum = pd.DataFrame({'pid': pid4num, 'project_key':uniq_prjnum})

# pids, fy, and ic
pid4fy, fys_support, ics_support = getUniqDF([p_ids, fys, ics])   
df_pid_fy_ic = pd.DataFrame({'pid': pid4fy, 'fy':fys_support, 'ic': ics_support })



In [115]:
#calculate years of support 
df_pid_fy_ic['val'] = 1
df_sum = df_pid_fy_ic[['pid', 'ic', 'val']]
df_support = df_sum.pivot_table(columns='ic', index='pid', values='val', aggfunc='sum')
df_support.columns = ['nigms_years', 'others_years']
df_support['nih_years'] =  df_support[['nigms_years', 'others_years']].apply(np.max, axis=1)



In [116]:
df_support.head(5)

Unnamed: 0_level_0,nigms_years,others_years,nih_years
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10000396,,5.0,5.0
10000490,,2.0,2.0
10001078,,3.0,3.0
10001128,,1.0,1.0
10001159,,1.0,1.0


### NIGMS researchers

In [117]:
#definition of continually supported
nigms_inds = df_support.nigms_years>1
nigms_contin_inds = df_support.nigms_years==15
nih_contin_inds = (df_support.nih_years==15) & (df_support.nigms_years > 1)
df_support['is_study'] = 0
df_support['is_comp'] = 0
df_support['is_nigms'] = 0
df_support.ix[nigms_contin_inds, 'is_study' ] = 1
df_support.ix[nih_contin_inds & (~nigms_contin_inds), 'is_comp' ] = 1
df_support.ix[nigms_inds, 'is_nigms'] = 1

In [118]:
df_support[nigms_inds].index

Index(['10008168', '10010745', '10010983', '10011950', '10021299', '10028321',
       '10036621', '10039101', '10058046', '10058461',
       ...
       '9971733', '9974889', '9983921', '9984179', '9985280', '9987162',
       '9996634', '9996881', '9997153', '9997198'],
      dtype='object', name='pid', length=8665)

In [119]:
len(df_support[nigms_inds]) 

8665

#### Continually Supported by NIGMS

In [120]:
len(df_support.ix[nigms_contin_inds, 'is_study' ])

523

#### Researchers who were continually supported by NIH

In [121]:
len(df_support.ix[nih_contin_inds & (~nigms_contin_inds), 'is_comp' ])

458

### Compare with Andrew pids

In [122]:
study_set = set(df_support.ix[nigms_contin_inds, 'is_study' ].index)
comp_set = set(df_support.ix[nih_contin_inds & (~nigms_contin_inds), 'is_comp' ].index) 
whole_set = study_set | comp_set
andrew_df = pd.read_csv('../data/andrew_pids.csv', header =0)
andrew_df = andrew_df.applymap(str)
andrew_set = set(andrew_df.iloc[:,0].values)

In [123]:
our_notin_andrew = (whole_set - andrew_set)
len(our_notin_andrew)

272

In [124]:
andrew_notin_ours = andrew_set - whole_set
len(andrew_notin_ours)

235

In [125]:
join_set = whole_set & andrew_set
len(join_set)

709

### Create Author Table

In [126]:
df_pid_name.drop_duplicates(['pid'], inplace=True)
df_pid_name = df_pid_name[~df_pid_name.pid.isnull()]
df_pid_name = df_pid_name[df_pid_name.full_name != 'WEDEN, MARGARET']

In [127]:
temp_df = df_pid_name.copy()
temp_df['pi_key'] = 'nih'+'_'+ temp_df['pid']
temp_df['full_name'] = temp_df['full_name'].str.lower()
temp_df['last_name'] = temp_df['full_name'].str.split(',').str.get(0)
temp_df['f_m_name'] = temp_df['full_name'].str.split(',').str.get(1)
temp_df['f_m_name'] = temp_df['f_m_name'].str.strip()
temp_df['first_name'] = temp_df['f_m_name'].str.split(' ').str.get(0)
temp_df['middle_name'] = temp_df['f_m_name'].str.split(' ').str.get(1)
temp_df['last_initial'] = temp_df['last_name'].str[0]
temp_df['first_initial'] = temp_df['first_name'].str[0]
temp_df['middle_initial'] = temp_df['middle_name'].str[0]



In [128]:
temp_df.shape

(79763, 10)

In [129]:
temp_df = temp_df.merge(df_support[['is_study', 'is_comp']], left_on='pid', right_index=True)
temp_df.drop(['f_m_name', 'pid'], axis=1, inplace=True)

In [130]:
len(temp_df[temp_df.is_comp == 1])

458

In [131]:
temp_df.head(5)

Unnamed: 0,full_name,pi_key,last_name,first_name,middle_name,last_initial,first_initial,middle_initial,is_study,is_comp
0,"rubin, j peter",nih_7667625,rubin,j,peter,r,j,p,0,0
1,"vaska, paul",nih_7006391,vaska,paul,,v,p,,0,0
2,"bound, john",nih_10234238,bound,john,,b,j,,0,0
3,"slayden, richard a",nih_7281791,slayden,richard,a,s,r,a,0,0
4,"rossi-george, alba",nih_8691731,rossi-george,alba,,r,a,,0,0


In [132]:
#create an author table
con_analysis_db = sq.connect('nih_analyses.db')

In [133]:
temp_df.to_sql('researcher',con_analysis_db, if_exists='replace')
con_analysis_db.execute('create unique index pi_key_index on researcher(pi_key);')

<sqlite3.Cursor at 0x1627f8500>

In [134]:
df_author = temp_df

### Create Author and Project Link table

In [135]:
temp_df = df_pid_prjnum.copy()
temp_df['pi_key'] = 'nih' + '_' + temp_df['pid']
temp_df.drop('pid', axis=1, inplace=True)
temp_df.to_sql('res_prj',con_analysis_db, if_exists='replace')
con_analysis_db.execute('create index pi_key_index_res_prj on res_prj(pi_key);')
con_analysis_db.execute('create index prjnum_index_res_prj on res_prj(project_key);')

<sqlite3.Cursor at 0x129f60880>

In [151]:
#lookup uniq project number for each author
df_study = df_author[df_author.is_study == 1]
df_comp = df_author[df_author.is_comp == 1]
df_study = df_study.merge(temp_df, on='pi_key', how='left') 
df_comp = df_comp.merge(temp_df, on='pi_key', how='left') 

### Create Project and Publication Link table

In [152]:
sql = '''
Select * 
From link
'''

df_prj_pub_link = pd.read_sql(sql, con_nih)

In [153]:
df_prj_pub_link.columns = ['publication_key' , 'project_key']

In [157]:
df_prj_pub_link.to_sql('prj_pub',con_analysis_db, if_exists='replace')
con_analysis_db.execute('create index prjnum_index_prj_pub on prj_pub(project_key);')
con_analysis_db.execute('create index pubkey_index_prj_pub on prj_pub(publication_key);')

<sqlite3.Cursor at 0x1627f8180>

In [158]:
df_prj_pub_link[df_prj_pub_link.publication_key==26538025]

Unnamed: 0,publication_key,project_key
20029,26538025,P41RR011823
159760,26538025,R01GM089970
183808,26538025,R01HL128370
216902,26538025,R13HL126239
219603,26538025,P41GM103533


In [160]:
df_study = df_study.merge(df_prj_pub_link, on='project_key', how='left')
df_comp = df_comp.merge(df_prj_pub_link, on='project_key', how='left')


### Join Author with Publications

In [165]:
sql = '''
Select * 
From publication
'''
df_pub = pd.read_sql(sql, con_nih)
df_study = df_study.merge(df_pub, left_on='publication_key',  right_on='PMID', how='left')
df_comp = df_comp.merge(df_pub, left_on='publication_key',  right_on='PMID', how='left')
df_study.to_csv('../data/study_pub.csv', header=True, index=False)
df_comp.to_csv('../data/comp_pub.csv', header=True, index=False)

### We uses project number to link researcher to publications.
### Researchers may be associated with projectis, but it does not mean they are authors of those publications

In [None]:
df_study = pd.read_csv('../data/study_pub.csv', header=0)
df_comp = pd.read_csv('../data/comp_pub.csv', header=0)
def is_author(row):
    fn = row['first_name']
    ln = row['last_name']
    fn = fn.capitalize()
    ln = ln.capitalize()
    full_name = ln + ', ' + fn
    authors = str(row['AUTHOR_LIST'])
    if full_name in authors:
        row['is_author'] = True
    else:
        row['is_author'] = False
    return row
df_study = df_study.apply(is_author, axis=1)
df_study_author = df_study[df_study.is_author==True]
df_comp = df_comp.apply(is_author, axis=1)
df_comp_author = df_comp[df_comp.is_author==True]
df_study_no_author = df_study[df_study.is_author==False]
df_comp_no_author = df_comp[df_comp.is_author==False]


In [7]:
df_proj_uniq = pd.read_csv('../data/prj_attribute.csv', header=0)
### Add some additional project attributes 
df_study_author = df_study_author.merge(df_proj_uniq, left_on='project_key', right_on='CORE_PROJECT_NUM', how='left')
df_comp_author = df_comp_author.merge(df_proj_uniq, left_on='project_key', right_on='CORE_PROJECT_NUM', how='left')
df_study_no_author = df_study_no_author.merge(df_proj_uniq, left_on='project_key', right_on='CORE_PROJECT_NUM', how='left')
df_comp_no_author =df_comp_no_author.merge(df_proj_uniq, left_on='project_key', right_on='CORE_PROJECT_NUM', how='left')
df_study_author.to_csv('../data/study_author_pub.csv', index=False, header=True)
df_comp_author.to_csv('../data/comp_author_pub.csv', index=False, header=True)
df_study_no_author.to_csv('../data/study_no_author_pub.csv', index=False, header=True)
df_comp_no_author.to_csv('../data/comp_no_author_pub.csv', index=False, header=True)

In [21]:
df_study_author_uniq = df_study_author.drop_duplicates('pi_key')
df_study_no_author_uniq = df_study_no_author.drop_duplicates('pi_key')
df_comp_author_uniq = df_comp_author.drop_duplicates('pi_key')
df_comp_no_author_uniq = df_comp_no_author.drop_duplicates('pi_key')
print('The number of study author: {0:d}'.format(len(df_study_author_uniq)))
print('The number of comp author: {0:d}'.format(len(df_comp_author_uniq)))





The number of study author: 487
The number of comp author: 416
