# Cleaning PI Info
To avoid feeding duplicated information into model training, PI information is stored separately from the list of features.

**Eventually store as SQL database**

In [24]:
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import cleaning_strings as cln
import importlib as imp
imp.reload(cln);

## Local functions 

In [25]:
def add_zero_zip(df, length, col1 = 'org_zipcode', col2 = 'org_country', country = 'united states'):
    '''
    Restore the leading zero that was dropped from zip codes.

    Selects the rows of ``df`` whose ``col1`` value is exactly ``length``
    characters long and whose ``col2`` value matches ``country``, and returns
    their zip codes both as found and with a '0' prepended. ``df`` itself is
    not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the zip code and country columns.
    length : int or float
        Length of the truncated zip codes to select (e.g. 8 or 4).
    col1 : str
        Name of the zip code column (string values).
    col2 : str
        Name of the country column (string values).
    country : str
        Pattern handed to ``Series.str.contains`` to pick the country.

    Returns
    -------
    (zip_original, zip_actual) : tuple of two lists of str
        Parallel lists with one entry per selected row (duplicates are kept):
        the zip codes as stored, and the same zip codes prefixed with '0'.
    '''
    has_length = df[col1].str.len() == length
    # na=False: a row with a missing country never counts as a match.
    in_country = df[col2].str.contains(country, na = False)
    # .loc replaces the removed .ix indexer; the mask is passed as a plain
    # array so duplicated index labels cannot interfere with the selection.
    selected = df.loc[(has_length & in_country).values, col1]
    zip_original = selected.tolist()
    zip_actual = ['0' + zipcode for zipcode in zip_original]
    return zip_original, zip_actual

def add_nih_info(df, col, replace_with, org_name = 'org_name', org_country = 'org_country'):
    '''
    Fill a missing value in ``col`` for NIH investigators.

    NIH investigators are the rows whose organization (``org_name``) is NOT
    null but whose country (``org_country``) is null. Because the country
    column identifies those rows, fill ``org_country`` itself last.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of PI information; it is not modified.
    col : str
        Column to fill.
    replace_with : scalar
        Value written where ``col`` is null on an NIH row.
    org_name, org_country : str
        Names of the organization and country columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the same rows and index, ``col`` filled for NIH
        rows, plus a helper column ``col + '_copy'`` that holds the filled
        value on NIH rows and NaN everywhere else.
    '''
    # Masks are plain arrays: the previous index-on-index merge multiplied rows
    # whenever df carried duplicated index labels, so nothing here is aligned
    # by label.
    is_nih = df[org_name].notnull().values & df[org_country].isnull().values
    is_missing = df[col].isnull().values

    df_filled = df.copy()
    df_filled[col + '_copy'] = df[col].fillna(replace_with).where(is_nih)
    df_filled[col] = df[col].mask(is_nih & is_missing, replace_with)
    return df_filled

## Basic text cleaning 

In [26]:
# Load the gzip-compressed grants table and keep only the PI / organization columns.
pi_columns = ['pi_ids', 'pi_names', 'org_name', 'org_city', 'org_state', 'org_country', 'org_zipcode']
all_info = pd.read_csv('grants_all.csv', compression = 'gzip')
pi_info_raw = all_info[pi_columns]

In [27]:
# Preview the first rows of the raw PI / organization columns.
pi_info_raw.head()

Unnamed: 0,pi_ids,pi_names,org_name,org_city,org_state,org_country,org_zipcode
0,10799126;,"scott, stuart alexander;",icahn school of medicine at mount sinai,new york,ny,united states,100296574
1,7017365;,"polster, brian m;",university of maryland baltimore,baltimore,md,united states,212011508
2,7354078; 1862210 (contact);,"walton, maureen a; zucker, robert alpert (cont...",university of michigan,ann arbor,mi,united states,481091276
3,2275890 (contact); 8742217; 6139020;,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369
4,9868481; 2063694 (contact);,"chen, xuesong ; geiger, jonathan david (contact);",university of north dakota,grand forks,nd,united states,582026059


In [28]:
# Clean every column with the project helper. The extra arguments (' ', ';', '.') are
# presumably the characters stripped from the ends of each value -- confirm in cleaning_strings.
pi_info_cleaned = cln.strip_df(pi_info_raw, ' ', ';', '.')

In [29]:
# Column dtypes and non-null counts after cleaning.
pi_info_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71872 entries, 0 to 71871
Data columns (total 7 columns):
pi_ids         71872 non-null object
pi_names       71872 non-null object
org_name       71766 non-null object
org_city       68527 non-null object
org_state      67740 non-null object
org_country    68528 non-null object
org_zipcode    68363 non-null object
dtypes: object(7)
memory usage: 3.8+ MB


In [30]:
# Preview the cleaned rows (trailing ';' removed, compare with the raw preview).
pi_info_cleaned.head()

Unnamed: 0,pi_ids,pi_names,org_name,org_city,org_state,org_country,org_zipcode
0,10799126,"scott, stuart alexander",icahn school of medicine at mount sinai,new york,ny,united states,100296574
1,7017365,"polster, brian m",university of maryland baltimore,baltimore,md,united states,212011508
2,7354078; 1862210 (contact),"walton, maureen a; zucker, robert alpert (cont...",university of michigan,ann arbor,mi,united states,481091276
3,2275890 (contact); 8742217; 6139020,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369
4,9868481; 2063694 (contact),"chen, xuesong ; geiger, jonathan david (contact)",university of north dakota,grand forks,nd,united states,582026059


## Splitting PI Info
Some grants have multiple PIs listed on the grant, and the information for the group of PIs is listed only as the contact PI's information. In order to get unique PI information, these PI groups must be separated into individual PIs.

Split pi_info_cleaned into two dataframes, one containing grouped (multiple) PIs and one containing single PIs.

In [70]:
# Grants with several PIs mark one of the IDs with '(contact)'; those rows are
# set aside and everything else is treated as a single-PI grant.
# .loc replaces the removed .ix indexer; na=False keeps the mask strictly boolean.
multi_pi_unsplit = pi_info_cleaned.loc[pi_info_cleaned['pi_ids'].str.contains('contact', na = False)]
pi_info = pi_info_cleaned.drop(multi_pi_unsplit.index)

Drop duplicates from unique PIs

In [71]:
pi_info.shape
# Keep one row per pi_ids value; only single-PI grants are in pi_info at this point.
pi_unique = pi_info.drop_duplicates('pi_ids')
pi_unique.shape
pi_unique.head()

(62262, 7)

(40104, 7)

Unnamed: 0,pi_ids,pi_names,org_name,org_city,org_state,org_country,org_zipcode
0,10799126,"scott, stuart alexander",icahn school of medicine at mount sinai,new york,ny,united states,100296574
1,7017365,"polster, brian m",university of maryland baltimore,baltimore,md,united states,212011508
5,1866930,"so, peter t",massachusetts institute of technology,cambridge,ma,united states,21421029
7,11044822,"gade, terence p",university of pennsylvania,philadelphia,pa,united states,191046205
8,9335858,"qi, ling",cornell university,ithaca,ny,united states,148502820


Split multiple PIs on PI ID.

In [72]:
# Expand each multi-PI row into one row per PI ID (see the output below).
# split_rows_pis is a project helper -- presumably it splits on ';', confirm in cleaning_strings.
multi_pi = cln.split_rows_pis(multi_pi_unsplit, 'pi_ids')
multi_pi.head()

Unnamed: 0,pi_names,org_name,org_city,org_state,org_country,org_zipcode,pi_ids
2,"walton, maureen a; zucker, robert alpert (cont...",university of michigan,ann arbor,mi,united states,481091276,7354078
2,"walton, maureen a; zucker, robert alpert (cont...",university of michigan,ann arbor,mi,united states,481091276,1862210 (contact)
3,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369,2275890 (contact)
3,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369,8742217
3,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369,6139020


Create a df where the '(contact)' string has been stripped from pi_ids; this will allow identification of unique PI IDs (otherwise an ID with '(contact)' appended at the end is viewed as a unique string).

In [73]:
multi_stripped = multi_pi.copy()
# Remove the literal '(contact)' marker, then surrounding whitespace.
# str.strip(' (contact)') treats its argument as a set of characters to trim
# from both ends, which only worked because the IDs consist of digits.
multi_stripped['pi_ids'] = (
    multi_stripped['pi_ids']
    .str.replace('(contact)', '', regex = False)
    .str.strip()
)
multi_stripped.head()

Unnamed: 0,pi_names,org_name,org_city,org_state,org_country,org_zipcode,pi_ids
2,"walton, maureen a; zucker, robert alpert (cont...",university of michigan,ann arbor,mi,united states,481091276,7354078
2,"walton, maureen a; zucker, robert alpert (cont...",university of michigan,ann arbor,mi,united states,481091276,1862210
3,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369,2275890
3,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369,8742217
3,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369,6139020


Create columns in split multi_pi df denoting whether the PI is the contact and whether the PI ID is already present in the df pi_unique (which contains all PIs that are solo authors of a grant). If the PI is already in pi_unique, we do not need to separate their information again.

In [74]:
# Flag the contact PI of each grant (their ID still carries the '(contact)' marker).
multi_pi['contact'] = multi_pi['pi_ids'].str.contains('contact')
# Membership is tested on the stripped IDs so the '(contact)' marker cannot hide a match.
in_pi_unique = multi_stripped['pi_ids'].isin(pi_unique['pi_ids'])
multi_pi['unique_pi'] = in_pi_unique
multi_pi.head()

Unnamed: 0,pi_names,org_name,org_city,org_state,org_country,org_zipcode,pi_ids,contact,unique_pi
2,"walton, maureen a; zucker, robert alpert (cont...",university of michigan,ann arbor,mi,united states,481091276,7354078,False,True
2,"walton, maureen a; zucker, robert alpert (cont...",university of michigan,ann arbor,mi,united states,481091276,1862210 (contact),True,False
3,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369,2275890 (contact),True,True
3,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369,8742217,False,True
3,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369,6139020,False,True


One PI is always designated as a contact if there are multiple PIs listed on a grant. If the PI's ID was not in the df pi_unique, but is listed as a contact (that is, unique_pi == False but contact == True), then we can isolate their information and add this information to pi_unique.

In [75]:
# Contact PIs whose ID is not already in pi_unique: the grant's organization
# details belong to them, so their own name can be isolated from the group.
is_new_contact = ~multi_pi['unique_pi'] & multi_pi['contact']
unique_contact = cln.split_rows_pis(multi_pi[is_new_contact], 'pi_names')
# After splitting the names, keep only the name tagged as the contact.
name_is_contact = unique_contact['pi_names'].str.contains('contact')
unique_contact = cln.strip_series(unique_contact[name_is_contact], ['pi_ids', 'pi_names'])
unique_contact = unique_contact.drop_duplicates('pi_ids')
unique_contact.shape
unique_contact.head()

(4078, 9)

Unnamed: 0,org_name,org_city,org_state,org_country,org_zipcode,pi_ids,contact,unique_pi,pi_names
2,university of michigan,ann arbor,mi,united states,481091276,1862210,True,False,"zucker, robert alpert"
18,clarkson university,potsdam,ny,united states,136995630,10420866,True,False,"philpott, sean m."
19,university of hawaii at manoa,honolulu,hi,united states,968222234,12546171,True,False,"pirkle, catherine mclean"
39,university of virginia,charlottesville,va,united states,229044195,1866186,True,False,"brautigan, david l."
45,university of washington,seattle,wa,united states,981959472,1896877,True,False,"glenny, robb w"


If a PI ID is neither in pi_unique nor ever listed as a contact, then we cannot tell whether that PI's organization information actually differs from the contact PI's information. The names for these PIs will therefore not be split (the IDs are already split).

**Note:** When doing analysis from multiple years, PI IDs should be cross-referenced across years in case a PI did have a solo grant in one year but not in others.

In [76]:
# Co-PIs that are neither in pi_unique nor the contact of their grant;
# one row is kept per PI ID and the grouped names are left unsplit.
is_unresolved = ~(multi_pi['unique_pi'] | multi_pi['contact'])
not_contact = multi_pi[is_unresolved].drop_duplicates('pi_ids')
not_contact.head()

Unnamed: 0,pi_names,org_name,org_city,org_state,org_country,org_zipcode,pi_ids,contact,unique_pi
6,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,9851446,False,False
6,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,2291297,False,False
6,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,12280974,False,False
6,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,10329759,False,False
6,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,12572655,False,False


Join all dfs containing unique PI ids.

In [77]:
# Reduce both frames to the same columns, in the column order of pi_unique,
# so they can be stacked row-wise. The names are spelled out instead of being
# sliced by position, which silently breaks if a column is added or moved.
cols1 = ['pi_ids', 'pi_names', 'org_name', 'org_city', 'org_state', 'org_country', 'org_zipcode']
unique_1 = unique_contact[cols1]

cols2 = list(cols1)
unique_2 = not_contact[cols2]

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
unique_multi = pd.concat([unique_1, unique_2])
unique_multi = unique_multi.drop_duplicates('pi_ids')
unique_multi.shape

pi_unique.shape
pi_unique = pd.concat([pi_unique, unique_multi])

# IDs that came out of the split may still carry surrounding spaces; strip
# them before the final de-duplication so ' 123' and '123' count as one PI.
pi_unique = cln.strip_series(pi_unique, ['pi_ids'], strip = ' ')
pi_unique = pi_unique.drop_duplicates('pi_ids')
pi_unique.shape

(9686, 7)

(40104, 7)

(48753, 7)

## Fixing zipcodes
The leading zero of zipcodes that begin with 0 (New England and other northeastern US zipcodes) was dropped in the raw data. Add a leading zero to US zipcodes of length 8 or 4.

In [78]:
# Distribution of zip code lengths before the fix (full US zip+4 codes have 9 digits).
pi_unique['org_zipcode'].str.len().value_counts()

9.0     39599
8.0      6409
5.0       483
7.0       295
4.0       233
6.0        65
3.0        54
1.0        37
10.0        3
2.0         3
Name: org_zipcode, dtype: int64

In [79]:
# Collect US zip codes that are one digit short (8 instead of 9, 4 instead of 5)
# together with their zero-padded forms.
zip_8, zip_9 = add_zero_zip(pi_unique, 8.0)
zip_4, zip_5 = add_zero_zip(pi_unique, 4.0)

In [80]:
zipcodes = pi_unique['org_zipcode']
# Pad only US rows. A plain value-based replace would also rewrite any foreign
# postcode that happens to equal one of the truncated US zip codes.
is_us = pi_unique['org_country'].str.contains('united states', na = False)
needs_zero = (is_us & zipcodes.isin(zip_8 + zip_4)).values
pi_unique['org_zipcode'] = zipcodes.mask(needs_zero, '0' + zipcodes)

In [81]:
# Distribution of zip code lengths after adding the leading zeros.
pi_unique['org_zipcode'].str.len().value_counts()

9.0     45953
5.0       517
7.0       295
4.0       199
6.0        65
8.0        55
3.0        54
1.0        37
10.0        3
2.0         3
Name: org_zipcode, dtype: int64

## Examining PIs from non-US countries

In [85]:
# Number of unique PIs per country (null countries are not counted by default).
pi_unique.org_country.value_counts()

united states     46464
canada              123
south africa        119
united kingdom       74
uganda               39
ethiopia             30
switzerland          28
kenya                28
australia            22
nigeria              21
zambia               20
india                19
germany              19
mozambique           18
france               18
tanzania u rep       18
peru                 16
thailand             15
china                14
brazil               14
ghana                12
botswana             12
mali                 11
zimbabwe             10
argentina            10
malawi                9
netherlands           8
vietnam               8
cote d'ivoire         8
israel                8
                  ...  
belgium               2
georgia               2
iceland               2
suriname              2
costa rica            2
senegal               2
hong kong             2
japan                 2
trinidad/toba         1
guinea-bissau         1
indonesia       

Although most PIs are from the US, there are a significant number not from the US.

In [82]:
# Rows whose country is anything other than 'united states', including rows with
# no country at all. .loc replaces the removed .ix indexer.
pi_unique.loc[pi_unique['org_country'] != 'united states']

Unnamed: 0,pi_ids,pi_names,org_name,org_city,org_state,org_country,org_zipcode
9,14380202,"boxer, matthew",translational science,,,,
10,10687426,"demner-fushman, dina",national library of medicine,,,,
11,9691085,"lakatta, edward",aging,,,,
16,10271451,"brewer, carmen crowell",deafness & other communication disorders,,,,
31,6569943,"kachar, bechara",deafness & other communication disorders,,,,
56,9692423,"warren, katherine e",basic sciences,,,,
59,12394158,"roberts, claire",university of adelaide,adelaide,,australia,5005
63,6480161,"neumann, ronald",clinical center,,,,
68,14282131,"momenan, abdolreza",alcohol abuse and alcoholism,,,,
86,12250037,"kabalo, abel n",eastern provincial health office,chipata,,zambia,10101


There are many PIs where the country is not listed.

In [86]:
# PIs with no country listed, tallied by organization (dropna=False keeps the
# rows that have no organization either). .loc replaces the removed .ix indexer.
null_country = pi_unique.loc[pi_unique['org_country'].isnull()]
null_country['org_name'].value_counts(dropna = False)

basic sciences                                           301
niaid extramural activities                              148
diabetes, digestive, kidney diseases                     102
child health and human development                        89
heart, lung, and blood institute                          81
environmental health sciences                             81
neurological disorders and stroke                         57
clinical center                                           57
aging                                                     51
national institute of mental health                       51
cancer epidemiology and genetics                          46
national eye institute                                    44
human genome research                                     37
national institute on drug abuse                          33
dental & craniofacial  research                           29
translational science                                     27
deafness & other communi

All the listed organization names are associated with the NIH, including omnitec solutions; therefore, NIH's information can be added to these PIs.

What about those PIs with no listed organization?

In [87]:
# PIs with neither a country nor an organization listed.
# .loc replaces the removed .ix indexer.
null_country.loc[null_country['org_name'].isnull()]

Unnamed: 0,pi_ids,pi_names,org_name,org_city,org_state,org_country,org_zipcode
720,12426198,"stevens, rick",,,,,
1362,14368787,"singer, barbara",,,,,
4378,12377190,"rozhko, alexander",,,,,
5753,12627838,"mash, deborah",,,,,
9741,14767866,"quevedo, celia",,,,,
17838,14164955,"dawson, michael j.",,,,,
21377,12674785,"anderson, paul",,,,,
29673,12674789,"young, steven",,,,,
34946,14600584,"kiernan, sheryl",,,,,
37811,14745788,"plotkin, melissa",,,,,


All grants with no listed country and no listed organization are of the SBIR/STTR contract form. There are only 25 in total with no information; however, they account for a non-trivial amount of money, so these 25 observations will be kept.

## Adding information for PIs at the NIH

NIH city, state, country and zipcode information is not listed (listed as NaN). Add these as bethesda, md, united states and 20892, respectively. **Note:** This address may not be exact, as some NIH institutes/centers may be located elsewhere, but this information will represent general NIH information.

In addition to NIH institutes, there are 20 entries with a PI ID where no information is listed.

In [88]:
# General NIH address, applied column by column. org_country must stay last:
# add_nih_info recognizes NIH rows by their missing country, so filling it
# earlier would hide those rows from the remaining calls.
nih_address = [
    ('org_city', 'bethesda'),
    ('org_state', 'md'),
    ('org_zipcode', '20892'),
    ('org_country', 'united states'),
]
for column, value in nih_address:
    pi_unique = add_nih_info(pi_unique, col = column, replace_with = value)

In [89]:
# Inspect the result, including the helper '*_copy' columns added by add_nih_info.
pi_unique.head(20)

Unnamed: 0,pi_ids,pi_names,org_name,org_city,org_state,org_country,org_zipcode,org_city_copy,org_state_copy,org_zipcode_copy,org_country_copy
0,10799126,"scott, stuart alexander",icahn school of medicine at mount sinai,new york,ny,united states,100296574,,,,
1,7017365,"polster, brian m",university of maryland baltimore,baltimore,md,united states,212011508,,,,
2,1862210,"zucker, robert alpert",university of michigan,ann arbor,mi,united states,481091276,,,,
5,1866930,"so, peter t",massachusetts institute of technology,cambridge,ma,united states,21421029,,,,
6,9851446,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,,,,
6,12280974,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,,,,
6,10329759,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,,,,
6,12572655,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,,,,
6,11706552,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,,,,
7,11044822,"gade, terence p",university of pennsylvania,philadelphia,pa,united states,191046205,,,,


In [90]:
# Discard the helper columns whose name contains '_copy'; they were only
# needed while the NIH values were being filled in.
to_keep = list(filter(lambda name: '_copy' not in name, pi_unique.columns))
pi_unique = pi_unique[to_keep]

## Final information 

In [91]:
# Final check of the table that will be written to disk.
pi_unique.head(20)

Unnamed: 0,pi_ids,pi_names,org_name,org_city,org_state,org_country,org_zipcode
0,10799126,"scott, stuart alexander",icahn school of medicine at mount sinai,new york,ny,united states,100296574
1,7017365,"polster, brian m",university of maryland baltimore,baltimore,md,united states,212011508
2,1862210,"zucker, robert alpert",university of michigan,ann arbor,mi,united states,481091276
5,1866930,"so, peter t",massachusetts institute of technology,cambridge,ma,united states,21421029
6,9851446,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209
6,12280974,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209
6,10329759,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209
6,12572655,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209
6,11706552,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209
7,11044822,"gade, terence p",university of pennsylvania,philadelphia,pa,united states,191046205


In [92]:
# Persist the de-duplicated PI table without the index column.
# Note: the file is gzip-compressed even though its name ends in .csv.
pi_unique.to_csv('pi_info.csv', index = False, compression = 'gzip')