In [1]:
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [244]:
all_info = pd.read_csv('grants_all.csv')
pi_info_raw = all_info['pi_ids pi_names org_name org_city org_state org_country org_zipcode'.split()]

In [245]:
pi_info_raw.head()

Unnamed: 0,pi_ids,pi_names,org_name,org_city,org_state,org_country,org_zipcode
0,10799126;,"scott, stuart alexander;",icahn school of medicine at mount sinai,new york,ny,united states,100296574
1,7017365;,"polster, brian m;",university of maryland baltimore,baltimore,md,united states,212011508
2,7354078; 1862210 (contact);,"walton, maureen a; zucker, robert alpert (cont...",university of michigan,ann arbor,mi,united states,481091276
3,2275890 (contact); 8742217; 6139020;,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369
4,9868481; 2063694 (contact);,"chen, xuesong ; geiger, jonathan david (contact);",university of north dakota,grand forks,nd,united states,582026059


In [246]:
def strip_string(df, *args):
    '''
    
    Strips final character from each column in a df.
    
    '''
    for arg in args:
        df = df.apply(lambda s: s.str.strip(arg))
    return df

pi_info_cleaned = strip_string(pi_info_raw, ' ', ';', '.')

In [247]:
pi_info_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71872 entries, 0 to 71871
Data columns (total 7 columns):
pi_ids         71872 non-null object
pi_names       71872 non-null object
org_name       71766 non-null object
org_city       68527 non-null object
org_state      67740 non-null object
org_country    68528 non-null object
org_zipcode    68363 non-null object
dtypes: object(7)
memory usage: 3.8+ MB


In [248]:
pi_info_cleaned.head()

Unnamed: 0,pi_ids,pi_names,org_name,org_city,org_state,org_country,org_zipcode
0,10799126,"scott, stuart alexander",icahn school of medicine at mount sinai,new york,ny,united states,100296574
1,7017365,"polster, brian m",university of maryland baltimore,baltimore,md,united states,212011508
2,7354078; 1862210 (contact),"walton, maureen a; zucker, robert alpert (cont...",university of michigan,ann arbor,mi,united states,481091276
3,2275890 (contact); 8742217; 6139020,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369
4,9868481; 2063694 (contact),"chen, xuesong ; geiger, jonathan david (contact)",university of north dakota,grand forks,nd,united states,582026059


## Splitting PI Info

In [265]:
multi_pi_unsplit = pi_info_cleaned.ix[pi_info_cleaned['pi_ids'].str.contains('contact')]

In [273]:
pi_info.head()

Unnamed: 0,pi_ids,pi_names,org_name,org_city,org_state,org_country,org_zipcode
0,10799126,"scott, stuart alexander",icahn school of medicine at mount sinai,new york,ny,united states,100296574
1,7017365,"polster, brian m",university of maryland baltimore,baltimore,md,united states,212011508
5,1866930,"so, peter t",massachusetts institute of technology,cambridge,ma,united states,21421029
7,11044822,"gade, terence p",university of pennsylvania,philadelphia,pa,united states,191046205
8,9335858,"qi, ling",cornell university,ithaca,ny,united states,148502820


In [252]:
multi_pi_unsplit.head()

Unnamed: 0,pi_ids,pi_names,org_name,org_city,org_state,org_country,org_zipcode
2,7354078; 1862210 (contact),"walton, maureen a; zucker, robert alpert (cont...",university of michigan,ann arbor,mi,united states,481091276
3,2275890 (contact); 8742217; 6139020,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369
4,9868481; 2063694 (contact),"chen, xuesong ; geiger, jonathan david (contact)",university of north dakota,grand forks,nd,united states,582026059
6,9851446; 2291297; 12280974; 9292719 (contact);...,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209
12,10940848; 1897156 (contact),"lozano, andres m.; lyketsos, constantine g (co...",johns hopkins university,baltimore,md,united states,212051832


In [298]:
def split_rows_pis(df, col_name):
    '''
    
    Different PIs are delimited by a ';'.
    Split on ';'
    
    '''
    df_copy = df.copy()
    s = df_copy[col_name].str.split(';').apply(pd.Series, 1).stack()
    s.index = s.index.droplevel(-1)
    s.name = col_name
    del df_copy[col_name]
    return df_copy.join(s)

multi_pi = split_rows_pis(multi_pi_unsplit, 'pi_ids')
multi_pi.head()

Unnamed: 0,pi_names,org_name,org_city,org_state,org_country,org_zipcode,pi_ids
2,"walton, maureen a; zucker, robert alpert (cont...",university of michigan,ann arbor,mi,united states,481091276,7354078
2,"walton, maureen a; zucker, robert alpert (cont...",university of michigan,ann arbor,mi,united states,481091276,1862210 (contact)
3,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369,2275890 (contact)
3,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369,8742217
3,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369,6139020


In [299]:
multi_stripped = multi_pi.copy()
multi_stripped['pi_ids'] = multi_stripped['pi_ids'].str.strip(' (contact)')
multi_stripped.head()

Unnamed: 0,pi_names,org_name,org_city,org_state,org_country,org_zipcode,pi_ids
2,"walton, maureen a; zucker, robert alpert (cont...",university of michigan,ann arbor,mi,united states,481091276,7354078
2,"walton, maureen a; zucker, robert alpert (cont...",university of michigan,ann arbor,mi,united states,481091276,1862210
3,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369,2275890
3,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369,8742217
3,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369,6139020


In [367]:
pi_info.shape
pi_unique = pi_info.drop_duplicates('pi_ids')
pi_unique.shape
pi_info.head()

(62262, 7)

(40104, 7)

Unnamed: 0,pi_ids,pi_names,org_name,org_city,org_state,org_country,org_zipcode
0,10799126,"scott, stuart alexander",icahn school of medicine at mount sinai,new york,ny,united states,100296574
1,7017365,"polster, brian m",university of maryland baltimore,baltimore,md,united states,212011508
5,1866930,"so, peter t",massachusetts institute of technology,cambridge,ma,united states,21421029
7,11044822,"gade, terence p",university of pennsylvania,philadelphia,pa,united states,191046205
8,9335858,"qi, ling",cornell university,ithaca,ny,united states,148502820


In [351]:
multi_pi['contact'] = multi_pi['pi_ids'].str.contains('contact')
in_pi_unique = multi_stripped['pi_ids'].isin(pi_unique['pi_ids'])
multi_pi['unique_pi'] = in_pi_unique
multi_pi.head()

Unnamed: 0,pi_names,org_name,org_city,org_state,org_country,org_zipcode,pi_ids,contact,unique_pi
2,"walton, maureen a; zucker, robert alpert (cont...",university of michigan,ann arbor,mi,united states,481091276,7354078,False,True
2,"walton, maureen a; zucker, robert alpert (cont...",university of michigan,ann arbor,mi,united states,481091276,1862210 (contact),True,False
3,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369,2275890 (contact),True,True
3,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369,8742217,False,True
3,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369,6139020,False,True


In [337]:
#if unique_pi is false but contact is true
#store these values
#split pi_names
#keep only PIs with contact
#add new PI name/info to df

def strip_series(df, col_list, strip = '(contact)'):
    for col in col_list:
        df[col] = df[col].str.strip(strip)
    return df

unique_contact = multi_pi[(multi_pi['unique_pi'] == False) & (multi_pi['contact'] == True)]
unique_contact = split_rows_pis(unique_contact, 'pi_names')
unique_contact = unique_contact[unique_contact['pi_names'].str.contains('contact')]
unique_contact = strip_series(unique_contact, ['pi_ids', 'pi_names'])
#unique_contact.shape
unique_contact = unique_contact.drop_duplicates('pi_ids')
unique_contact.shape
unique_contact.head()

(4866, 9)

(4078, 9)

Unnamed: 0,org_name,org_city,org_state,org_country,org_zipcode,pi_ids,contact,unique_pi,pi_names
2,university of michigan,ann arbor,mi,united states,481091276,1862210,True,False,"zucker, robert alpert"
18,clarkson university,potsdam,ny,united states,136995630,10420866,True,False,"philpott, sean m."
19,university of hawaii at manoa,honolulu,hi,united states,968222234,12546171,True,False,"pirkle, catherine mclean"
39,university of virginia,charlottesville,va,united states,229044195,1866186,True,False,"brautigan, david l."
45,university of washington,seattle,wa,united states,981959472,1896877,True,False,"glenny, robb w"


In [366]:
#if unique_pi and contact are both false
#first store these rows; org info will already be contact person's org info
#split names, check if unique
#unique pi_names will be PIs who don't have solo grants and aren't the contact

##problem: for contacts, unique names could be isolated by (contact) string
#without contact, how to isolate individual names and match with their ID #?

#for each occurrence of 'pi_names':
#first set before ; assoc with pi_ids
##be careful...contact ids are not included, but contact names are included!

##do these need to be split? have the pi_ids, other info will be the same because not ever a contact

not_contact = multi_pi[(multi_pi['unique_pi'] == False) & (multi_pi['contact'] == False)]
not_contact = not_contact.drop_duplicates('pi_ids')
not_contact.head()

#Questions for analysis: how many grants are solo vs. joint? how many PIs have solo vs. joint grants?

Unnamed: 0,pi_names,org_name,org_city,org_state,org_country,org_zipcode,pi_ids,contact,unique_pi
6,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,9851446,False,False
6,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,2291297,False,False
6,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,12280974,False,False
6,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,10329759,False,False
6,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,12572655,False,False
