# Cleaning PI Info
In order to prevent duplicate information used to train a model, store PI information separately from list of features.

**Eventually store as SQL database**

In [1]:
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import cleaning_strings as cln
import importlib as imp
imp.reload(cln);

## Local functions 

In [2]:
def add_zero_zip(df, length, col1 = 'org_zipcode', col2 = 'org_country', country = 'united states'):
    
    '''Add leading zero to New England zip codes.'''
    
    zip_original = []
    zip_actual = []
    for zipcode in df.ix[(df[col1].str.len() == length) & (df[col2].str.contains(country))][col1]:
        zip_original.append(zipcode)
        zipcode = '0' + zipcode
        zip_actual.append(zipcode)
    return zip_original, zip_actual

def add_nih_info(df, col, replace_with, org_name = 'org_name', org_country = 'org_country'):
    '''
    
    Add NIH information.
    NIH investigators are those whose organization is NOT null,
    but whose country is null.
    
    '''
    df_replace = pd.DataFrame(df.ix[~df[org_name].isnull() & df[org_country].isnull()][col].replace(np.nan, replace_with))
    df_merged = pd.merge(df, df_replace, how = 'left', left_index=True, right_index=True, suffixes=('', '_copy'))
    df_merged[col].fillna(df_merged[col + '_copy'], inplace = True)
    return df_merged

## Cleaning PI information
We want a dataframe where each row is a single PI (no duplicates) and associated organization information as a cross-reference to the grants data.

Import relevant columns from csv with raw grant information.

In [60]:
columns = 'fy pi_ids pi_names org_name org_city org_state org_country org_zipcode org_duns'.split()
dtypes = {key: str for key in columns}

In [61]:
pi_info_raw = pd.read_csv('all_grants.csv', compression = 'gzip', usecols = columns, dtype = dtypes)
pi_info_raw.tail()

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
2223287,2016,lawrence,united states,76248616,university of kansas lawrence,ks,660457568,9524770;,"mcgill, jodi l.;"
2223288,2016,davis,united states,47120084,university of california at davis,ca,956186153,6490459;,"clancy, colleen e;"
2223289,2016,la jolla,united states,804355790,university of california san diego,ca,920930934,1901669;,"feng, gen-sheng ;"
2223290,2016,coral gables,united states,52780918,university of miami school of medicine,fl,331462926,10944221;,"sharifai, nima ;"
2223291,2016,toledo,united states,51623734,university of toledo,oh,436063390,9288457;,"liu, song-tao ;"


In [63]:
pi_info_cleaned = cln.strip_df(pi_info_raw, ' ', ';', ' ', '.')
pi_info_cleaned.tail()

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
2223287,2016,lawrence,united states,76248616,university of kansas lawrence,ks,660457568,9524770,"mcgill, jodi l"
2223288,2016,davis,united states,47120084,university of california at davis,ca,956186153,6490459,"clancy, colleen e"
2223289,2016,la jolla,united states,804355790,university of california san diego,ca,920930934,1901669,"feng, gen-sheng"
2223290,2016,coral gables,united states,52780918,university of miami school of medicine,fl,331462926,10944221,"sharifai, nima"
2223291,2016,toledo,united states,51623734,university of toledo,oh,436063390,9288457,"liu, song-tao"


In [64]:
pi_info_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2223292 entries, 0 to 2223291
Data columns (total 9 columns):
fy             object
org_city       object
org_country    object
org_duns       object
org_name       object
org_state      object
org_zipcode    object
pi_ids         object
pi_names       object
dtypes: object(9)
memory usage: 152.7+ MB


## Splitting PI Info
Some grants have multiple PIs listed on the grant, and the information for the group of PIs is listed only as the contact PI's information. In order to get unique PI information, these PI groups must be separated into individual PIs.

Split pi_info_cleaned into two dataframes, one containing grouped (multiple) PIs and one containing single PIs.

In [67]:
multi_pi_unsplit = pi_info_cleaned.ix[pi_info_cleaned['pi_ids'].str.contains('contact', na = False)]
pi_info = pi_info_cleaned.drop(multi_pi_unsplit.index)

Drop duplicates to obtain unique PIs that were awarded solo grants. Check both PI IDs and organization name to check for PIs that moved to a different institution.

In [68]:
pi_info.shape
pi_unique = pi_info.drop_duplicates(['pi_ids', 'org_duns'])
pi_unique.shape
pi_unique.tail()

(2174999, 9)

(415466, 9)

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
2223248,2016,champaign,united states,41544081,university of illinois at urbana-champaign,il,618207473,2084901,"bolton, eric c"
2223251,2016,davis,united states,47120084,university of california at davis,ca,956186153,9856365,"juliano, celina"
2223255,2016,seattle,united states,605799469,university of washington,wa,981959472,11678618,"gerner, michael"
2223258,2016,omaha,united states,168559177,university of nebraska medical center,ne,681987835,1873357,"sanderson, sam d"
2223273,2016,hartford,united states,807853791,connecticut state dept of public health,ct,61061367,14753985,"gonsalves, lou"


Split multiple PIs on PI ID.

In [69]:
multi_pi = cln.split_rows(multi_pi_unsplit, 'pi_ids', by = ';')
multi_pi.tail()

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_names,pi_ids
2223041,2016,atlanta,united states,66469933,emory university,ga,303224250,"jovanovic, tanja ; smith, alicia k. (contact)",8800131 (contact)
2223047,2016,new york,united states,41968306,new york university,ny,100122300,"pyle, anna marie; schlick, tamar (contact)",1885894
2223047,2016,new york,united states,41968306,new york university,ny,100122300,"pyle, anna marie; schlick, tamar (contact)",2414701 (contact)
2223241,2016,baltimore,united states,188435911,university of maryland baltimore,md,212011508,"kao, joseph pao yung; mayer, dirk (contact)",8080184
2223241,2016,baltimore,united states,188435911,university of maryland baltimore,md,212011508,"kao, joseph pao yung; mayer, dirk (contact)",9340781 (contact)


Create a df where the '(contact)' string has been stripped from pi_ids; this will allow identification of unique PI IDs (otherwise an ID with '(contact)' appended at the end is viewed as a unique string).

In [70]:
multi_stripped = multi_pi.copy()
multi_stripped['pi_ids'] = multi_stripped['pi_ids'].str.strip(' (contact)')
multi_stripped.tail()

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_names,pi_ids
2223041,2016,atlanta,united states,66469933,emory university,ga,303224250,"jovanovic, tanja ; smith, alicia k. (contact)",8800131
2223047,2016,new york,united states,41968306,new york university,ny,100122300,"pyle, anna marie; schlick, tamar (contact)",1885894
2223047,2016,new york,united states,41968306,new york university,ny,100122300,"pyle, anna marie; schlick, tamar (contact)",2414701
2223241,2016,baltimore,united states,188435911,university of maryland baltimore,md,212011508,"kao, joseph pao yung; mayer, dirk (contact)",8080184
2223241,2016,baltimore,united states,188435911,university of maryland baltimore,md,212011508,"kao, joseph pao yung; mayer, dirk (contact)",9340781


Create columns in split multi_pi df denoting whether the PI is the contact and whether the PI ID is already present in the df pi_unique (which contains all PIs that are solo authors of a grant). If the PI is already in pi_unique, we do not need to separate their information again.

In [71]:
multi_pi['contact'] = multi_pi['pi_ids'].str.contains('contact', na = False)
in_pi_unique = multi_stripped['pi_ids'].isin(pi_unique['pi_ids'])
multi_pi['unique_pi'] = in_pi_unique
multi_pi.tail()

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_names,pi_ids,contact,unique_pi
2223041,2016,atlanta,united states,66469933,emory university,ga,303224250,"jovanovic, tanja ; smith, alicia k. (contact)",8800131 (contact),True,True
2223047,2016,new york,united states,41968306,new york university,ny,100122300,"pyle, anna marie; schlick, tamar (contact)",1885894,False,True
2223047,2016,new york,united states,41968306,new york university,ny,100122300,"pyle, anna marie; schlick, tamar (contact)",2414701 (contact),True,True
2223241,2016,baltimore,united states,188435911,university of maryland baltimore,md,212011508,"kao, joseph pao yung; mayer, dirk (contact)",8080184,False,True
2223241,2016,baltimore,united states,188435911,university of maryland baltimore,md,212011508,"kao, joseph pao yung; mayer, dirk (contact)",9340781 (contact),True,True


One PI is always designated as a contact if there are multiple PIs listed on a grant. If the PI's ID was not in the df pi_unique, but is listed as a contact (that is, unique_pi == False but contact == True), then we can isolate their information and add this information to pi_unique.

In [72]:
unique_contact = multi_pi[(multi_pi['unique_pi'] == False) & (multi_pi['contact'] == True)]
unique_contact = cln.split_rows(unique_contact, 'pi_names', ';')
unique_contact = unique_contact[unique_contact['pi_names'].str.contains('contact')]
unique_contact = cln.strip_series(unique_contact, ['pi_ids', 'pi_names'])

#shape before dropping duplicates
unique_contact.shape
unique_contact = unique_contact.drop_duplicates('pi_ids org_duns'.split())

#shape after dropping duplicates
unique_contact.shape
unique_contact.tail()

(6083, 11)

(2903, 11)

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,contact,unique_pi,pi_names
2221142,2016,washington,united states,72643117,american society for microbiology,dc,200362904,8632984,True,False,"chang, amy lee"
2221202,2016,tucson,united states,79416826,"synactix pharmaceuticals, inc",az,857182014,10738456,True,False,"li, hong-yu"
2221747,2016,rochester,united states,6471700,mayo clinic rochester,mn,559050001,9965561,True,False,"pereira, naveen luke"
2221778,2016,baltimore,united states,78748558,"elixirgen, llc",md,212051511,11903165,True,False,"mano, tomokazu"
2222666,2016,boston,united states,73130411,massachusetts general hospital,ma,21142696,1902302,True,False,"cahill, daniel p"


If the PI ID is neither listed in unique_pi nor are they ever listed as a contact, then we cannot identify whether their organization information is actually different from the contact PI's information. These names will therefore not be split (the IDs are already split).

**Note:** When doing analysis from multiple years, PI IDs should be cross-referenced across years in case a PI did have a solo grant in one year but not in others.

In [73]:
not_contact = multi_pi[(multi_pi['unique_pi'] == False) & (multi_pi['contact'] == False)]
not_contact = not_contact.drop_duplicates('pi_ids org_duns'.split())
not_contact.shape
not_contact.tail()

(5963, 11)

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_names,pi_ids,contact,unique_pi
2222255,2016,aurora,united states,41096314,university of colorado denver,co,800452571,"d'sa, viren ; deoni, sean cl (contact); hobbin...",12291424,False,False
2222255,2016,aurora,united states,41096314,university of colorado denver,co,800452571,"d'sa, viren ; deoni, sean cl (contact); hobbin...",6667032,False,False
2222255,2016,aurora,united states,41096314,university of colorado denver,co,800452571,"d'sa, viren ; deoni, sean cl (contact); hobbin...",6870571,False,False
2222458,2016,providence,united states,1785542,brown university,ri,29129002,"chan, philip andrew; mena, leandro antonio; nu...",8556604,False,False
2222666,2016,boston,united states,73130411,massachusetts general hospital,ma,21142696,"brastianos, priscilla kaliopi; cahill, daniel ...",11213017,False,False


Join all dfs containing unique PI ids.

In [74]:
#Reorder columns so joins can be performed correctly

#unique PIs with a solo grant
cols1 = unique_contact.columns.tolist()
cols1 = cols1[-4:-3] + cols1[-1:] + cols1[0:5]
unique_1 = unique_contact[cols1]

#unique PIs that neither have a solor grant nor are listed as a contact
cols2 = not_contact.columns.tolist()
cols2 = cols2[-3:-2] + cols2[0:6]
unique_2 = not_contact[cols2]

#unique PIs that do not have solo grants but have a
unique_multi = unique_1.append(unique_2)
unique_multi = unique_multi.drop_duplicates('pi_ids org_name'.split())
unique_multi.shape

pi_unique.shape
pi_unique = pi_unique.append(unique_multi)

pi_unique = cln.strip_series(pi_unique, ['pi_ids'], strip = ' ')
pi_unique = pi_unique.drop_duplicates('pi_ids org_name'.split())
pi_unique.shape

(7996, 8)

(415466, 9)

(352523, 9)

## Fixing zipcodes
The leading zero of New England zipcodes was dropped in the raw data. Add leading zero to zipcodes from the US that are length 8 or 4.

In [75]:
pi_unique['org_zipcode'].str.len().value_counts()

9.0     158682
5.0     138823
8.0       9030
4.0       3892
7.0       1478
6.0        305
3.0        156
1.0         58
2.0         13
10.0        11
Name: org_zipcode, dtype: int64

In [76]:
zip_8, zip_9 = add_zero_zip(pi_unique, 8.0)
zip_4, zip_5 = add_zero_zip(pi_unique, 4.0)

In [77]:
pi_unique['org_zipcode'] = pi_unique['org_zipcode'].replace(zip_8, zip_9)
pi_unique['org_zipcode'] = pi_unique['org_zipcode'].replace(zip_4, zip_5)

In [78]:
pi_unique['org_zipcode'].str.len().value_counts()

9.0     167556
5.0     142068
7.0       1478
4.0        647
6.0        305
3.0        156
8.0        156
1.0         58
2.0         13
10.0        11
Name: org_zipcode, dtype: int64

## Cleaning DUNS
All organization DUNS should be 9 digits long, although some entries have multiple DUNS listed.

In [79]:
pi_unique['org_duns'].str.len().value_counts()

9.0     196290
8.0      39066
7.0       9993
20.0       367
19.0        59
Name: org_duns, dtype: int64

In [80]:
pi_unique.ix[pi_unique['org_duns'].str.len() == 8.0]

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
1,1985,berkeley,united states,94878337,university of california berkeley,ca,947045940,2407264,"lashof, joyce c"
4,1985,san diego,united states,73371346,san diego state university,ca,92182,1957769,"scutchfield, f douglas"
5,1985,new haven,united states,82359691,yale university,ct,065208047,1968150,"stolwijk, jan a"
9,1985,baltimore,united states,45911138,johns hopkins university,md,21218,2405793,"schoenrich, edyth h"
10,1985,boston,united states,82359691,harvard university (medical school),ma,02115,3700005,"blout, eklan r"
15,1985,new york,united states,64931884,columbia univ new york morningside,ny,100277003,3700008,"weiss, robert j"
16,1985,chapel hill,united states,78861598,university of north carolina chapel hill,nc,27599,1873027,"ibrahim, michel a"
18,1985,pittsburgh,united states,53785812,university of pittsburgh at pittsburgh,pa,15213,3700010,"peterson, karen s"
23,1985,newark,united states,59007500,university of delaware,de,19716,3700089,"anderson, edith h"
25,1985,buffalo,united states,38633251,state university of new york at buffalo,ny,14260,3700100,"bullough, bonnie"


In [81]:
pi_unique.ix[pi_unique['org_duns'].str.len() == 7.0]

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
0,1985,birmingham,united states,4514360,university of alabama at birmingham,al,35294,3700006,"bridgers, william f"
28,1985,birmingham,united states,4514360,university of alabama at birmingham,al,35294,2416546,"o'koren, marie l"
35,1985,baltimore,united states,3255213,university of maryland baltimore,md,212011508,1990219,"holt, frieda m"
50,1985,indianapolis,united states,5436803,indiana univ-purdue univ at indianapolis,in,462025167,3700186,"froebe, doris j"
51,1985,new york,united states,4514360,new york university,ny,100122331,1968619,"winstead-fry, patricia e"
68,1985,portland,united states,9584210,oregon health and science university,or,972393098,3700269,"boyd, sherry t"
74,1985,nashville,united states,4413456,vanderbilt university,tn,372036869,3173562,"conway-welch, colleen"
100,1985,charlottesville,united states,1910777,university of virginia charlottesville,va,229044195,2402920,"brodie, barbara m"
112,1985,hampton,united states,3135068,hampton university,va,23668,1875785,"daniel, elnora d"
152,1985,chicago,united states,5436803,northwestern university,il,60611,2081822,"davis, lucille"


According to NIH's description, multiple DUNS are separated by a semi-colon. There are only two groups where multiple DUNS are listed, those with 19 characters and those with 20 characters.

In [83]:
pi_unique.ix[pi_unique['org_duns'].str.len() == 19.0]

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
1669155,2010,augusta,united states,809593387;966668691,georgia regents university,ga,309124810.0,9437629.0,"risher, william christopher"
1669167,2010,augusta,united states,809593387;966668691,georgia regents university,ga,309124810.0,9699188.0,"kelly-cobbs, aisha imani"
1669753,2010,augusta,united states,809593387;966668691,georgia regents university,ga,309124810.0,10123038.0,"rafikova, olga"
1671078,2010,augusta,united states,809593387;966668691,georgia regents university,ga,309124810.0,8815155.0,"wu, wei-hua e"
1671793,2010,augusta,united states,809593387;966668691,georgia regents university,ga,309124810.0,6225753.0,"kohan, donald e"
1672451,2010,augusta,united states,809593387;966668691,georgia regents university,ga,309124810.0,2275110.0,"bieberich, erhard"
1672485,2010,augusta,united states,809593387;966668691,georgia regents university,ga,309124810.0,9360980.0,"kim, jimok"
1673577,2010,augusta,united states,809593387;966668691,georgia regents university,ga,309124810.0,7726328.0,"rempala, grzegorz a"
1674423,2010,augusta,united states,809593387;966668691,georgia regents university,ga,309124810.0,1861006.0,"ganapathy, vadivel"
1675135,2010,augusta,united states,809593387;966668691,georgia regents university,ga,309124810.0,2654991.0,"didion, sean p"


In [84]:
pi_unique.ix[pi_unique['org_duns'].str.len() == 20.0]

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
1846017,2012,augusta,united states,809593387; 966668691,georgia regents university,ga,309120004.0,9470515.0,"pacholczyk, rafal wojciech"
1846028,2012,augusta,united states,809593387; 966668691,georgia regents university,ga,309120004.0,6491945.0,"dong, zheng"
1846522,2012,augusta,united states,809593387; 966668691,georgia regents university,ga,309120004.0,10353675.0,"loria, analia"
1846817,2012,augusta,united states,809593387; 966668691,georgia regents university,ga,309120004.0,2091767.0,"mellor, andrew lee"
1847006,2012,lubbock,united states,609980727; 962545658,texas tech university health scis center,tx,794306271.0,10645764.0,"price, roderick"
1847293,2012,augusta,united states,809593387; 966668691,georgia regents university,ga,309120004.0,10662293.0,"bollinger, kathryn"
1847807,2012,augusta,united states,809593387; 966668691,georgia regents university,ga,309120004.0,1866163.0,"isales, carlos miguel"
1848338,2012,eugene,united states,079289626; 948117312,university of oregon,or,974035219.0,10341194.0,"conery, john"
1848339,2012,augusta,united states,809593387; 966668691,georgia regents university,ga,309120004.0,1882582.0,"yu, robert k"
1849369,2012,augusta,united states,809593387; 966668691,georgia regents university,ga,309120004.0,6772735.0,"dhandapani, krishnan m"


For organizations where two DUNS are listed, it seems that both DUNS are used to identify the organization. The only difference between a DUNS with 20 characters and 19 characters is a space after the ';'. Add a space to the 19-length DUNS and replace these values in the df.

In [111]:
#Get the DUNS numbers from pi_unique where the length of the DUNS is 19
len19 = pi_unique.ix[pi_unique['org_duns'].str.len() == 19.0]['org_duns'].values.tolist()

#add a space after the semi-colon
len19_new = []
for i in range(len(len19)):
    len19_new.append(len19[i][:10] + ' ' + len19[i][10:])

#replace the original DUNS of 19 characters
pi_unique['org_duns'] = pi_unique['org_duns'].replace(len19, len19_new)

Remove duplicates by pi_id and org_duns.

In [117]:
pi_unique.shape
pi_unique = pi_unique.drop_duplicates('pi_ids org_duns'.split())
pi_unique.shape

(352476, 9)

(352476, 9)

## Examining PIs from non-US countries

In [118]:
pd.set_option('max_rows', 1000)
pi_unique.org_country.value_counts()

united states     346238
canada              1062
united kingdom       592
south africa         309
australia            272
switzerland          198
israel               186
china                146
france               120
uganda               113
sweden               112
germany              101
kenya                 99
india                 92
ethiopia              88
netherlands           83
zambia                81
tanzania u rep        78
brazil                69
italy                 68
denmark               67
zimbabwe              51
thailand              51
nigeria               48
peru                  47
argentina             45
finland               43
vietnam               41
malawi                40
belgium               39
mozambique            39
rwanda                38
mexico                38
cote d'ivoire         34
haiti                 30
botswana              29
fed micronesia        29
new zealand           29
trinidad/toba         28
colombia              27


Although most PIs are from the US, there are a significant number not from the US, and some countries are not listed.

In [119]:
pi_unique.ix[pi_unique['org_country'] != 'united states']

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
790,1985,kolonia,fed micronesia,,community college of micronesia,,96941,3703974,"jerry, timothy"
1725,1985,montreal,canada,,mcgill university,pq,h3a 2t5,1894983,"metherate, raju"
2032,1985,amsterdam,netherlands,,netherlands cancer institute,,1066 cx,1872424,"white, theodore c"
2107,1985,cambridge,united kingdom,226552610,university of cambridge,,cb2 1tn,1929741,"blazer-yost, bonnie l"
2114,1985,tokyo,japan,712739783,university of tokyo,,1130033,8659411,"collie, nathan l"
2155,1985,adelaide,australia,,flinders university of south australia,,5001,1862732,"galligan, james j"
2265,1985,cambridge,united kingdom,232560263,medical res council lab of molec biol,,cb2 2qh,1925731,"drew, horace r"
2275,1985,lyon,france,279551881,international agency for res on cancer,,69372,2045141,"becker, richard a"
2276,1985,lyon,france,279551881,international agency for res on cancer,,69372,2061193,"becker, richard a"
2297,1985,rehovot,israel,,weizmann institute of science,,76100,1965010,"dorsett, dale l"


In [120]:
null_country = pi_unique.ix[pi_unique['org_country'].isnull()]
null_country['org_name'].value_counts(dropna = False)

NaN                                                      156
basic sciences                                           150
niaid extramural activities                               52
heart, lung, and blood institute                          44
environmental health sciences                             39
clinical center                                           35
child health and human development                        32
translational science                                     29
cancer epidemiology and genetics                          25
diabetes, digestive, kidney diseases                      25
national eye institute                                    24
neurological disorders and stroke                         23
national institute of mental health                       21
human genome research                                     20
national institute on drug abuse                          19
aging                                                     18
national cancer institut

In [137]:
pi_unique.ix[pi_unique['org_name'].str.contains('american college of sports medicine', na = False)]

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
824632,1999,indianapolis,united states,,american college of sports medicine,in,462023233.0,6069040,"terjung, ronald l"
1232453,2005,,,,american college of sports medicine foun,,,9102267,"durstine, larry"
1232454,2005,,,,american college of sports medicine,,,9102272,"chodzko-zajko, wojtek"
1232455,2005,,,,american college of sports medicine fnd,,,9102277,"durstine, j larry"
1476852,2007,indianapolis,united states,,american college of sports medicine,in,462023233.0,10355884,"whitehead, james r"


In [59]:
pi_unique['org_name'].value_counts(dropna = False)

NaN                                                               15998
johns hopkins university                                           5520
university of washington                                           4423
university of pennsylvania                                         4194
university of california san francisco                             4083
stanford university                                                4069
university of california los angeles                               3796
university of california san diego                                 3509
washington university                                              3473
duke university                                                    3423
yale university                                                    3394
university of pittsburgh at pittsburgh                             3159
university of colorado denver                                      2953
university of michigan at ann arbor                             

Below: no listed country or DUNS

In [123]:
null_country.ix[null_country['org_duns'].isnull()]

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
940314,2001,,,,the johns hopkins university,,,1858936,"yu, xiao-fang"
943341,2001,,,,institut de genetique et biologie molecu,,,6937416,"auwerx, johan h"
989497,2001,,,,philadelphia health management corp,,,8996260,"lauby, jennifer l"
1001769,2001,,,,ethiopian health &nutrition res. inst,,,9099959,"gidey, tsehaynesh messele"
1002316,2001,,,,ministry of health,,,8990171,"opio, alex"
1002317,2001,,,,"ministry of public health, thailand",,,8990192,"boonyawongvirot, prat"
1002321,2001,,,,ministry of health,,,9069199,"thi minh, luu chau"
1008722,2002,,,,the congressional glaucoma caucus,,,8993844,"grant, stanley j"
1019142,2002,,,,new england research institute,,,6315186,"wright, elizabeth"
1020504,2002,,,,"alphavax, inc",,,6944198,"keith, paula m"


## Adding information for PIs at the NIH

NIH city, state, country and zipcode information is not listed (listed as NaN). Add these as bethesda, md, united states and 20892, respectively. **Note:** This address may not be exact, as some NIH institutes/centers may be located elsewhere, but this information will represent general NIH information.

In addition to NIH institutes, there are 20 entries with a PI ID where no information is listed.

In [88]:
pi_unique = add_nih_info(pi_unique, col = 'org_city', replace_with = 'bethesda')
pi_unique = add_nih_info(pi_unique, col = 'org_state', replace_with = 'md')
pi_unique = add_nih_info(pi_unique, col = 'org_zipcode', replace_with = '20892')
pi_unique = add_nih_info(pi_unique, col = 'org_country', replace_with = 'united states')

In [89]:
pi_unique.head(20)

Unnamed: 0,pi_ids,pi_names,org_name,org_city,org_state,org_country,org_zipcode,org_city_copy,org_state_copy,org_zipcode_copy,org_country_copy
0,10799126,"scott, stuart alexander",icahn school of medicine at mount sinai,new york,ny,united states,100296574,,,,
1,7017365,"polster, brian m",university of maryland baltimore,baltimore,md,united states,212011508,,,,
2,1862210,"zucker, robert alpert",university of michigan,ann arbor,mi,united states,481091276,,,,
5,1866930,"so, peter t",massachusetts institute of technology,cambridge,ma,united states,21421029,,,,
6,9851446,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,,,,
6,12280974,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,,,,
6,10329759,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,,,,
6,12572655,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,,,,
6,11706552,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,,,,
7,11044822,"gade, terence p",university of pennsylvania,philadelphia,pa,united states,191046205,,,,


In [90]:
to_keep = [col for col in pi_unique.columns if '_copy' not in col]
pi_unique = pi_unique[to_keep]
#pi_unique.filter(regex = '_copy')

## Final information 

In [91]:
pi_unique.head(20)

Unnamed: 0,pi_ids,pi_names,org_name,org_city,org_state,org_country,org_zipcode
0,10799126,"scott, stuart alexander",icahn school of medicine at mount sinai,new york,ny,united states,100296574
1,7017365,"polster, brian m",university of maryland baltimore,baltimore,md,united states,212011508
2,1862210,"zucker, robert alpert",university of michigan,ann arbor,mi,united states,481091276
5,1866930,"so, peter t",massachusetts institute of technology,cambridge,ma,united states,21421029
6,9851446,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209
6,12280974,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209
6,10329759,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209
6,12572655,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209
6,11706552,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209
7,11044822,"gade, terence p",university of pennsylvania,philadelphia,pa,united states,191046205


In [92]:
pi_unique.to_csv('pi_info.csv', index = False, compression = 'gzip')