# Cleaning PI Info
In order to prevent duplicate information used to train a model, store PI information separately from list of features.

**Eventually store as SQL database**

In [128]:
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import cleaning_strings as cln
import importlib as imp
imp.reload(cln);

## Local functions 

In [129]:
def add_zero_zip(df, length, col1 = 'org_zipcode', col2 = 'org_country', country = 'united states'):
    
    '''Add leading zero to New England zipcodes.'''
    
    original = []
    actual = []
    for num in df.ix[(df[col1].str.len() == length) & (df[col2].str.contains(country))][col1]:
        original.append(num)
        num = '0' + num
        actual.append(num)
    return original, actual

def add_nih_info(df, col, replace_with, org_name = 'org_name', org_country = 'org_country'):
    '''
    
    Add NIH information.
    NIH investigators are those whose organization is NOT null,
    but whose country is null.
    
    '''
    df_replace = pd.DataFrame(df.ix[~df[org_name].isnull() & df[org_country].isnull()][col].replace(np.nan, replace_with))
    df_merged = pd.merge(df, df_replace, how = 'left', left_index=True, right_index=True, suffixes=('', '_copy'))
    df_merged[col].fillna(df_merged[col + '_copy'], inplace = True)
    return df_merged

## Cleaning PI information
We want a dataframe where each row is a single PI (no duplicates) and associated organization information as a cross-reference to the grants data.

Import relevant columns from csv with raw grant information.

In [130]:
columns = 'fy pi_ids pi_names org_name org_city org_state org_country org_zipcode org_duns'.split()
dtypes = {key: str for key in columns}

In [131]:
pi_info_raw = pd.read_csv('all_grants.csv', compression = 'gzip', usecols = columns, dtype = dtypes)

#Only analyzing grants from 2000 onwards, as prior to that no funding information is available
pi_info_raw = pi_info_raw.ix[pi_info_raw['fy'] >= '2000']
pi_info_raw.tail()

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
2223287,2016,lawrence,united states,76248616,university of kansas lawrence,ks,660457568,9524770;,"mcgill, jodi l.;"
2223288,2016,davis,united states,47120084,university of california at davis,ca,956186153,6490459;,"clancy, colleen e;"
2223289,2016,la jolla,united states,804355790,university of california san diego,ca,920930934,1901669;,"feng, gen-sheng ;"
2223290,2016,coral gables,united states,52780918,university of miami school of medicine,fl,331462926,10944221;,"sharifai, nima ;"
2223291,2016,toledo,united states,51623734,university of toledo,oh,436063390,9288457;,"liu, song-tao ;"


In [132]:
pi_info_cleaned = cln.strip_df(pi_info_raw, ' ', ';', ' ', '.')
pi_info_cleaned.tail()

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
2223287,2016,lawrence,united states,76248616,university of kansas lawrence,ks,660457568,9524770,"mcgill, jodi l"
2223288,2016,davis,united states,47120084,university of california at davis,ca,956186153,6490459,"clancy, colleen e"
2223289,2016,la jolla,united states,804355790,university of california san diego,ca,920930934,1901669,"feng, gen-sheng"
2223290,2016,coral gables,united states,52780918,university of miami school of medicine,fl,331462926,10944221,"sharifai, nima"
2223291,2016,toledo,united states,51623734,university of toledo,oh,436063390,9288457,"liu, song-tao"


In [133]:
pi_info_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1383066 entries, 840226 to 2223291
Data columns (total 9 columns):
fy             1383066 non-null object
org_city       1325576 non-null object
org_country    1353507 non-null object
org_duns       1144656 non-null object
org_name       1377934 non-null object
org_state      1312652 non-null object
org_zipcode    1320858 non-null object
pi_ids         1379204 non-null object
pi_names       1379204 non-null object
dtypes: object(9)
memory usage: 105.5+ MB


## Splitting PI Info
Some grants have multiple PIs listed on the grant, and the information for the group of PIs is listed only as the contact PI's information. In order to get unique PI information, these PI groups must be separated into individual PIs.

Split pi_info_cleaned into two dataframes, one containing grouped (multiple) PIs and one containing single PIs.

In [134]:
multi_pi_unsplit = pi_info_cleaned.ix[pi_info_cleaned['pi_ids'].str.contains('contact', na = False)]
pi_info = pi_info_cleaned.drop(multi_pi_unsplit.index)

Drop duplicates to obtain unique PIs that were awarded solo grants. Check both PI IDs and organization name to check for PIs that moved to a different institution.

In [135]:
pi_info.shape
pi_unique = pi_info.drop_duplicates(['pi_ids', 'org_duns'])
pi_unique.shape
pi_unique.tail()

(1334773, 9)

(269473, 9)

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
2223248,2016,champaign,united states,41544081,university of illinois at urbana-champaign,il,618207473,2084901,"bolton, eric c"
2223251,2016,davis,united states,47120084,university of california at davis,ca,956186153,9856365,"juliano, celina"
2223255,2016,seattle,united states,605799469,university of washington,wa,981959472,11678618,"gerner, michael"
2223258,2016,omaha,united states,168559177,university of nebraska medical center,ne,681987835,1873357,"sanderson, sam d"
2223273,2016,hartford,united states,807853791,connecticut state dept of public health,ct,61061367,14753985,"gonsalves, lou"


Split multiple PIs on PI ID.

In [136]:
multi_pi = cln.split_rows(multi_pi_unsplit, 'pi_ids', by = ';')
multi_pi.tail()

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_names,pi_ids
2223041,2016,atlanta,united states,66469933,emory university,ga,303224250,"jovanovic, tanja ; smith, alicia k. (contact)",8800131 (contact)
2223047,2016,new york,united states,41968306,new york university,ny,100122300,"pyle, anna marie; schlick, tamar (contact)",1885894
2223047,2016,new york,united states,41968306,new york university,ny,100122300,"pyle, anna marie; schlick, tamar (contact)",2414701 (contact)
2223241,2016,baltimore,united states,188435911,university of maryland baltimore,md,212011508,"kao, joseph pao yung; mayer, dirk (contact)",8080184
2223241,2016,baltimore,united states,188435911,university of maryland baltimore,md,212011508,"kao, joseph pao yung; mayer, dirk (contact)",9340781 (contact)


Create a df where the '(contact)' string has been stripped from pi_ids; this will allow identification of unique PI IDs (otherwise an ID with '(contact)' appended at the end is viewed as a unique string).

In [137]:
multi_stripped = multi_pi.copy()
multi_stripped['pi_ids'] = multi_stripped['pi_ids'].str.strip(' (contact)')
multi_stripped.tail()

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_names,pi_ids
2223041,2016,atlanta,united states,66469933,emory university,ga,303224250,"jovanovic, tanja ; smith, alicia k. (contact)",8800131
2223047,2016,new york,united states,41968306,new york university,ny,100122300,"pyle, anna marie; schlick, tamar (contact)",1885894
2223047,2016,new york,united states,41968306,new york university,ny,100122300,"pyle, anna marie; schlick, tamar (contact)",2414701
2223241,2016,baltimore,united states,188435911,university of maryland baltimore,md,212011508,"kao, joseph pao yung; mayer, dirk (contact)",8080184
2223241,2016,baltimore,united states,188435911,university of maryland baltimore,md,212011508,"kao, joseph pao yung; mayer, dirk (contact)",9340781


Create columns in split multi_pi df denoting whether the PI is the contact and whether the PI ID is already present in the df pi_unique (which contains all PIs that are solo authors of a grant). If the PI is already in pi_unique, we do not need to separate their information again.

In [138]:
multi_pi['contact'] = multi_pi['pi_ids'].str.contains('contact', na = False)
in_pi_unique = multi_stripped['pi_ids'].isin(pi_unique['pi_ids'])
multi_pi['unique_pi'] = in_pi_unique
multi_pi.tail()

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_names,pi_ids,contact,unique_pi
2223041,2016,atlanta,united states,66469933,emory university,ga,303224250,"jovanovic, tanja ; smith, alicia k. (contact)",8800131 (contact),True,True
2223047,2016,new york,united states,41968306,new york university,ny,100122300,"pyle, anna marie; schlick, tamar (contact)",1885894,False,True
2223047,2016,new york,united states,41968306,new york university,ny,100122300,"pyle, anna marie; schlick, tamar (contact)",2414701 (contact),True,True
2223241,2016,baltimore,united states,188435911,university of maryland baltimore,md,212011508,"kao, joseph pao yung; mayer, dirk (contact)",8080184,False,True
2223241,2016,baltimore,united states,188435911,university of maryland baltimore,md,212011508,"kao, joseph pao yung; mayer, dirk (contact)",9340781 (contact),True,True


One PI is always designated as a contact if there are multiple PIs listed on a grant. If the PI's ID was not in the df pi_unique, but is listed as a contact (that is, unique_pi == False but contact == True), then we can isolate their information and add this information to pi_unique.

In [139]:
unique_contact = multi_pi[(multi_pi['unique_pi'] == False) & (multi_pi['contact'] == True)]
unique_contact = cln.split_rows(unique_contact, 'pi_names', ';')
unique_contact = unique_contact[unique_contact['pi_names'].str.contains('contact')]
unique_contact = cln.strip_series(unique_contact, ['pi_ids', 'pi_names'])

#shape before dropping duplicates
unique_contact.shape
unique_contact = unique_contact.drop_duplicates('pi_ids org_duns'.split())

#shape after dropping duplicates
unique_contact.shape
unique_contact.tail()

(6291, 11)

(3013, 11)

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,contact,unique_pi,pi_names
2221142,2016,washington,united states,72643117,american society for microbiology,dc,200362904,8632984,True,False,"chang, amy lee"
2221202,2016,tucson,united states,79416826,"synactix pharmaceuticals, inc",az,857182014,10738456,True,False,"li, hong-yu"
2221747,2016,rochester,united states,6471700,mayo clinic rochester,mn,559050001,9965561,True,False,"pereira, naveen luke"
2221778,2016,baltimore,united states,78748558,"elixirgen, llc",md,212051511,11903165,True,False,"mano, tomokazu"
2222666,2016,boston,united states,73130411,massachusetts general hospital,ma,21142696,1902302,True,False,"cahill, daniel p"


If the PI ID is neither listed in unique_pi nor are they ever listed as a contact, then we cannot identify whether their organization information is actually different from the contact PI's information. These names will therefore not be split (the IDs are already split).

**Note:** When doing analysis from multiple years, PI IDs should be cross-referenced across years in case a PI did have a solo grant in one year but not in others.

In [140]:
not_contact = multi_pi[(multi_pi['unique_pi'] == False) & (multi_pi['contact'] == False)]
not_contact = not_contact.drop_duplicates('pi_ids org_duns'.split())
not_contact.shape
not_contact.tail()

(6200, 11)

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_names,pi_ids,contact,unique_pi
2222255,2016,aurora,united states,41096314,university of colorado denver,co,800452571,"d'sa, viren ; deoni, sean cl (contact); hobbin...",12291424,False,False
2222255,2016,aurora,united states,41096314,university of colorado denver,co,800452571,"d'sa, viren ; deoni, sean cl (contact); hobbin...",6667032,False,False
2222255,2016,aurora,united states,41096314,university of colorado denver,co,800452571,"d'sa, viren ; deoni, sean cl (contact); hobbin...",6870571,False,False
2222458,2016,providence,united states,1785542,brown university,ri,29129002,"chan, philip andrew; mena, leandro antonio; nu...",8556604,False,False
2222666,2016,boston,united states,73130411,massachusetts general hospital,ma,21142696,"brastianos, priscilla kaliopi; cahill, daniel ...",11213017,False,False


Join all dfs containing unique PI ids.

In [141]:
#Reorder columns so joins can be performed correctly

#unique PIs with a solo grant
cols1 = unique_contact.columns.tolist()
cols1 = cols1[-4:-3] + cols1[-1:] + cols1[0:5]
unique_1 = unique_contact[cols1]

#unique PIs that neither have a solor grant nor are listed as a contact
cols2 = not_contact.columns.tolist()
cols2 = cols2[-3:-2] + cols2[0:6]
unique_2 = not_contact[cols2]

#unique PIs that do not have solo grants but have a
unique_multi = unique_1.append(unique_2)
unique_multi = unique_multi.drop_duplicates('pi_ids org_name'.split())
unique_multi.shape

pi_unique.shape
pi_unique = pi_unique.append(unique_multi)

pi_unique = cln.strip_series(pi_unique, ['pi_ids'], strip = ' ')
pi_unique = pi_unique.drop_duplicates('pi_ids org_name'.split())
pi_unique.shape

(8298, 8)

(269473, 9)

(217465, 9)

## Fixing zipcodes
The leading zero of New England zipcodes was dropped in the raw data. Add leading zero to zipcodes from the US that are length 8 or 4.

In [142]:
pi_unique['org_zipcode'].str.len().value_counts()

9.0     116920
5.0      76249
8.0       7391
7.0       1016
4.0        701
6.0        261
3.0        104
1.0         58
10.0        11
2.0         10
Name: org_zipcode, dtype: int64

In [143]:
zip_8, zip_9 = add_zero_zip(pi_unique, 8.0)
zip_4, zip_5 = add_zero_zip(pi_unique, 4.0)

In [144]:
pi_unique['org_zipcode'] = pi_unique['org_zipcode'].replace(zip_8, zip_9)
pi_unique['org_zipcode'] = pi_unique['org_zipcode'].replace(zip_4, zip_5)

In [145]:
pi_unique['org_zipcode'].str.len().value_counts()

9.0     124180
5.0      76379
7.0       1016
4.0        571
6.0        261
8.0        131
3.0        104
1.0         58
10.0        11
2.0         10
Name: org_zipcode, dtype: int64

## DUNS numbers
The DUNS number should be a 9-digit number that uniquely identifies an organization. However, the DUNS numbers of organizations changed between 2008 to 2009.

In [154]:
pi_unique.ix[(pi_unique['org_name'].str.contains('stanford', na = False)) & ((pi_unique['fy'] == '2008') | (pi_unique['fy'] == '2009'))]

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
1482486,2008,stanford,united states,800771545,stanford university,ca,943056203.0,7569301,"kesler, shelli r"
1482494,2008,stanford,united states,800771545,stanford university,ca,943056203.0,3121111,"bryant, zev"
1482881,2008,stanford,united states,800771545,stanford university,ca,943056203.0,9167595,"chang, catherine elizabeth"
1482892,2008,stanford,united states,800771545,stanford university,ca,943056203.0,9298617,"samanez larkin, gregory r"
1482897,2008,stanford,united states,800771545,stanford university,ca,943056203.0,9294648,"banko, max ryan"
1483451,2008,stanford,united states,800771545,stanford university,ca,943056203.0,8470072,"brown, justin e"
1483461,2008,stanford,united states,800771545,stanford university,ca,943056203.0,9155238,"hernandez, armando ricardo"
1483463,2008,stanford,united states,800771545,stanford university,ca,943056203.0,9201684,"pina, francisco javier"
1483505,2008,stanford,united states,800771545,stanford university,ca,943056203.0,7028019,"esparza, lourdes adriana"
1483675,2008,stanford,united states,800771545,stanford university,ca,943056203.0,8822430,"burrows, brittany"


In [156]:
pi_unique.ix[pi_unique['org_duns'] == '009214214']['org_name'].unique()

array(['burnham institute for medical research', 'stanford university'], dtype=object)

The DUNS number is also not unique between 2008 and 2009.

In [159]:
pi_unique.ix[(pi_unique['org_name'].str.contains('burnham', na = False)) & ((pi_unique['fy'] == '2008') | (pi_unique['fy'] == '2009'))]

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
1482883,2008,la jolla,united states,9214214,burnham institute for medical research,ca,92037,9168920,"opaluch, amanda"
1484399,2008,la jolla,united states,9214214,burnham institute for medical research,ca,92037,8967204,"miletic sedy, ana"
1492092,2008,la jolla,united states,9214214,burnham institute for medical research,ca,92037,8795531,"bode, lars"
1498546,2008,la jolla,united states,9214214,burnham institute for medical research,ca,92037,6209347,"lowe, john b"
1498549,2008,la jolla,united states,9214214,burnham institute for medical research,ca,92037,1893535,"von andrian, ulrich h"
1499820,2008,la jolla,united states,9214214,burnham institute for medical research,ca,92037,6767953,"gu, zezong"
1499821,2008,la jolla,united states,9214214,burnham institute for medical research,ca,92037,8775300,"riedl, stefan j"
1499822,2008,la jolla,united states,9214214,burnham institute for medical research,ca,92037,1864249,"masliah, eliezer"
1515340,2008,la jolla,united states,9214214,burnham institute for medical research,ca,92037,1883552,"muller-sieburg, christa e"
1517543,2008,la jolla,united states,9214214,burnham institute for medical research,ca,92037,1940901,"bradley, linda mac pherson"


Replace DUNS numbers from 2008 and earlier with current DUNS numbers.

In [162]:
before_2008 = pi_unique.ix[pi_unique['fy'] <= '2008']
before_2008.shape
after_2008 = pi_unique.ix[pi_unique['fy'] > '2008']
after_2008.shape

(126611, 9)

(90854, 9)

In [186]:
from difflib import SequenceMatcher
def similar(str1, str2):
    return SequenceMatcher(None, str1, str2).ratio()
similar('burnham institute for medical research', 'sanford-burnham medical research instit')

0.6233766233766234

In [190]:
names_after_2008 = after_2008['org_name'].dropna().unique()
names_before_2008 = before_2008['org_name'].dropna().unique()
names_after_2008.shape
names_before_2008.shape

(9219,)

(9926,)

In [191]:
similarity = {}
for i in range(len(names_after_2008)):
    str1 = names_after_2008[i]
    for j in range(len(names_before_2008)):
        str2 = names_before_2008[j]
        sim_ratio = similar(str1, str2)
        if sim_ratio >= 0.5:
            similarity[str1] = (str2, sim_ratio)

In [192]:
similarity

{'iq solutions, inc': ('celsense, inc', 0.5333333333333333),
 'university of massachusetts amherst': ('university legal services', 0.5),
 'fox chase cancer center': ('oregon advocacy center', 0.5777777777777777),
 'duke university': ('university legal services', 0.5),
 'centre for chronic disease control': ('advocacy center for persons w/ disabilit',
  0.5135135135135135),
 'investigaciones medicas en salud': ('xcision medical systems, llc', 0.5),
 'mu-jhu care': ('mu-jhu care', 1.0),
 'universidad nacional mayor de san marcos': ('traditional &modern health practioners',
  0.5128205128205128),
 'h. lee moffitt cancer ctr &res inst': ('cancer research uk cambridge res inst',
  0.5),
 'oregon health &science university': ('oregon advocacy center',
  0.509090909090909),
 'university of california at davis': ('university legal services',
  0.5862068965517241),
 'icahn school of medicine at mount sinai': ('faculty of medicine of timone',
  0.5588235294117647),
 'dana-farber cancer inst': ('

### DUNS numbers of different lengths
DUNS numbers should be 9 digits.

In [146]:
pi_unique['org_duns'].str.len().value_counts()

9.0     139318
8.0      26370
7.0       7115
20.0       369
19.0        59
Name: org_duns, dtype: int64

In [147]:
pi_unique.ix[pi_unique['org_duns'].str.len() == 8.0]

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
1225532,2004,kampala,uganda,56553705,protecting families against aids,,,9100440,"serukka, david"
1307513,2005,johannesburg,south africa,b6213-g1,soul city,,,9023030,"japhet, garth"
1397149,2007,greensboro,united states,11247255,moses cone-wesley long cmty health fdn,nc,274087019,9064574,"coates, george m"
1397431,2007,wenatchee,united states,03480039,chelan-douglas counties together for dfy,wa,98801,9065902,"hunter, renee"
1397510,2007,amityville,united states,06574026,amityville union free school district,ny,117013161,9066302,"taliercio, maria"
1397725,2007,warwick,united states,79037218,chariho tri-town task force,ri,02889,9162324,"augustinho, paula"
1397736,2007,mercer island,united states,05630797,"city of mercer island, washington",wa,98040,9437722,"franklin, derek j"
1397776,2007,spencer,united states,79784474,"positively spencer youth, inc",ia,51301,9162476,"wilkerson, rick"
1486486,2008,greensboro,united states,11247255,moses cone-wesley long cmty health fdn,nc,274087019,10238893,"nussbaum, victor"
1486755,2008,amityville,united states,06574026,amityville union free school district,ny,117013161,10630762,"velez, ray"


In [148]:
pi_unique.ix[pi_unique['org_duns'].str.len() == 7.0]

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
1668444,2010,salt lake city,united states,9095365,university of utah,ut,841128930,8601681,"parks, thomas"
1668446,2010,hampton,united states,3135068,hampton university,va,236680108,8206462,"harvey, william r"
1668452,2010,chicago,united states,5421136,university of chicago,il,606375418,8596104,"hopkins, michael d"
1668472,2010,university park,united states,3403953,pennsylvania state university-univ park,pa,168027000,10378392,"foley, henry c"
1668552,2010,nashville,united states,4413456,vanderbilt university,tn,372122809,7053397,"fesik, stephen w"
1668561,2010,rochester,united states,6471700,mayo clinic rochester,mn,559050001,6377890,"nelson, timothy james"
1668564,2010,cambridge,united states,1425594,massachusetts institute of technology,ma,02142,8728398,"nolan, elizabeth m"
1668566,2010,stanford,united states,9214214,stanford university,ca,943041222,7541501,"dunn, alexander r"
1668571,2010,cambridge,united states,1425594,massachusetts institute of technology,ma,02142,1872180,"niles, jacquin c"
1668577,2010,university park,united states,3403953,pennsylvania state university-univ park,pa,168027000,9493483,"huang, tony jun"


It seems like most of the 8 and 7 digit DUNS numbers are missing leading zeros. Assume this is the case and add leading zeros to DUNS numbers.

In [99]:
def add_zero_duns(df, col1 = 'org_duns'):
    #original = []
    actual = []
    series = df[col1].tolist()
    for num in series:
        #original.append(num)
        if type(num) == float:
            pass
        else:
            while len(num) < 9.0:
                num = '0' + num
        actual.append(num)
    df[col1 + '_'] = actual
    return df

pi_unique = add_zero_duns(pi_unique)

In [100]:
del pi_unique['org_duns']
pi_unique = pi_unique.rename(columns = {'org_duns_':'org_duns'})

Re-organize columns.

In [101]:
old_cols = pi_unique.columns.tolist()
new_cols = old_cols[0:1] + old_cols[-3:-1] + old_cols[-1:] + old_cols[3:4] + old_cols[1:2] + old_cols[4:5] + old_cols[5:6] + old_cols[2:3]
pi_unique = pi_unique[new_cols]
pi_unique.head()

Unnamed: 0,fy,pi_ids,pi_names,org_duns,org_name,org_city,org_state,org_zipcode,org_country
840226,2000,1860776,"bunn, paul a",65391526.0,university of colorado denver,aurora,co,800452505,united states
840227,2000,6423558,"lecca, pedro j",56282296.0,howard university,washington,dc,20059,united states
840228,2000,1871945,"krugman, richard d",65391526.0,university of colorado denver,aurora,co,800452505,united states
840229,2000,6522067,"pell, eva j",,pennsylvania state university-univ park,university park,pa,16802,united states
840230,2000,8756854,"logan, timothy m",20520466.0,florida state university,tallahassee,fl,323064166,united states


According to NIH's description, multiple DUNS are separated by a semi-colon. There are only two groups where multiple DUNS are listed, those with 19 characters and those with 20 characters.

In [102]:
pi_unique.ix[pi_unique['org_duns'].str.len() == 20.0]

Unnamed: 0,fy,pi_ids,pi_names,org_duns,org_name,org_city,org_state,org_zipcode,org_country
1846017,2012,9470515,"pacholczyk, rafal wojciech",809593387; 966668691,georgia regents university,augusta,ga,309120004,united states
1846028,2012,6491945,"dong, zheng",809593387; 966668691,georgia regents university,augusta,ga,309120004,united states
1846522,2012,10353675,"loria, analia",809593387; 966668691,georgia regents university,augusta,ga,309120004,united states
1846817,2012,2091767,"mellor, andrew lee",809593387; 966668691,georgia regents university,augusta,ga,309120004,united states
1847006,2012,10645764,"price, roderick",609980727; 962545658,texas tech university health scis center,lubbock,tx,794306271,united states
1847293,2012,10662293,"bollinger, kathryn",809593387; 966668691,georgia regents university,augusta,ga,309120004,united states
1847807,2012,1866163,"isales, carlos miguel",809593387; 966668691,georgia regents university,augusta,ga,309120004,united states
1848338,2012,10341194,"conery, john",079289626; 948117312,university of oregon,eugene,or,974035219,united states
1848339,2012,1882582,"yu, robert k",809593387; 966668691,georgia regents university,augusta,ga,309120004,united states
1849369,2012,6772735,"dhandapani, krishnan m",809593387; 966668691,georgia regents university,augusta,ga,309120004,united states


In [103]:
pi_unique.ix[pi_unique['org_duns'].str.len() == 19.0]

Unnamed: 0,fy,pi_ids,pi_names,org_duns,org_name,org_city,org_state,org_zipcode,org_country
1669155,2010,9437629.0,"risher, william christopher",809593387;966668691,georgia regents university,augusta,ga,309124810.0,united states
1669167,2010,9699188.0,"kelly-cobbs, aisha imani",809593387;966668691,georgia regents university,augusta,ga,309124810.0,united states
1669753,2010,10123038.0,"rafikova, olga",809593387;966668691,georgia regents university,augusta,ga,309124810.0,united states
1671078,2010,8815155.0,"wu, wei-hua e",809593387;966668691,georgia regents university,augusta,ga,309124810.0,united states
1671793,2010,6225753.0,"kohan, donald e",809593387;966668691,georgia regents university,augusta,ga,309124810.0,united states
1672451,2010,2275110.0,"bieberich, erhard",809593387;966668691,georgia regents university,augusta,ga,309124810.0,united states
1672485,2010,9360980.0,"kim, jimok",809593387;966668691,georgia regents university,augusta,ga,309124810.0,united states
1673577,2010,7726328.0,"rempala, grzegorz a",809593387;966668691,georgia regents university,augusta,ga,309124810.0,united states
1674423,2010,1861006.0,"ganapathy, vadivel",809593387;966668691,georgia regents university,augusta,ga,309124810.0,united states
1675135,2010,2654991.0,"didion, sean p",809593387;966668691,georgia regents university,augusta,ga,309124810.0,united states


For organizations where two DUNS are listed, it seems that both DUNS are used to identify the organization. The only difference between a DUNS with 20 characters and 19 characters is a space after the ';'. Add a space to the 19-length DUNS and replace these values in the df.

In [104]:
#Get the DUNS numbers from pi_unique where the length of the DUNS is 19
len19 = pi_unique.ix[pi_unique['org_duns'].str.len() == 19.0]['org_duns'].values.tolist()

#add a space after the semi-colon
len19_new = []
for i in range(len(len19)):
    len19_new.append(len19[i][:10] + ' ' + len19[i][10:])

#replace the original DUNS of 19 characters
pi_unique['org_duns'] = pi_unique['org_duns'].replace(len19, len19_new)

Remove duplicates by pi_id and org_duns, as a PI can move between institutions.

In [105]:
pi_unique.shape
pi_unique = pi_unique.drop_duplicates('pi_ids org_duns'.split())
pi_unique.shape

(217465, 9)

(216031, 9)

## Examining PIs from non-US countries

In [106]:
pd.set_option('max_rows', 1000)
pi_unique.org_country.value_counts()

united states     211016
canada               682
united kingdom       401
south africa         308
australia            224
switzerland          146
uganda               114
china                103
kenya                 99
ethiopia              88
india                 88
zambia                81
israel                77
tanzania u rep        76
france                74
germany               74
brazil                65
netherlands           59
sweden                56
thailand              49
zimbabwe              49
nigeria               48
peru                  48
argentina             40
malawi                40
mozambique            39
vietnam               39
rwanda                38
cote d'ivoire         34
denmark               34
italy                 33
mexico                31
haiti                 30
botswana              29
mali                  27
fed micronesia        22
spain                 21
colombia              21
belgium               20
bangladesh            19


Although most PIs are from the US, there are a significant number not from the US, and some countries are not listed.

In [107]:
pi_unique.ix[pi_unique['org_country'] != 'united states'].count()

fy             5015
pi_ids         5004
pi_names       4790
org_duns       2874
org_name       4858
org_city       3997
org_state       655
org_zipcode    2447
org_country    4008
dtype: int64

In [108]:
null_country = pi_unique.ix[pi_unique['org_country'].isnull()]
null_country['org_name'].value_counts(dropna = False)

NaN                                                      156
basic sciences                                           152
niaid extramural activities                               53
heart, lung, and blood institute                          45
environmental health sciences                             39
clinical center                                           36
child health and human development                        33
translational science                                     29
diabetes, digestive, kidney diseases                      25
national eye institute                                    25
cancer epidemiology and genetics                          25
neurological disorders and stroke                         23
national institute of mental health                       21
human genome research                                     21
national institute on drug abuse                          19
national cancer institute                                 18
aging                   

In [112]:
null_country.ix[null_country['org_name'].isnull()]

Unnamed: 0,fy,pi_ids,pi_names,org_duns,org_name,org_city,org_state,org_zipcode,org_country
1495798,2008,,,85341774.0,,,,,
1590927,2009,10221863.0,"x, x",132220443.0,,,,,
1667723,2009,11595713.0,"deborah, bitner",,,,,,
1761771,2010,10690514.0,"crist, keith",,,,,,
1761806,2010,10496126.0,"schwingl, pam",,,,,,
1845667,2011,11173671.0,"kummins, mara",,,,,,
1910703,2012,11590222.0,"mcconathy, walter",,,,,,
1925615,2013,12062832.0,"allen, erin",,,,,,
1925799,2013,12062685.0,"bluestone, jeffrey",,,,,,
1927246,2013,12068559.0,"dua, sonia",,,,,,


In [111]:
null_country.ix[null_country['org_duns'].isnull()].count()

fy             938
pi_ids         938
pi_names       938
org_duns         0
org_name       848
org_city         0
org_state        0
org_zipcode      0
org_country      0
dtype: int64

## Adding information for PIs at the NIH

NIH city, state, country and zipcode information is not listed (listed as NaN). Add these as bethesda, md, united states and 20892, respectively. **Note:** This address may not be exact, as some NIH institutes/centers may be located elsewhere, but this information will represent general NIH information.

In addition to NIH institutes, there are 20 entries with a PI ID where no information is listed.

In [88]:
pi_unique = add_nih_info(pi_unique, col = 'org_city', replace_with = 'bethesda')
pi_unique = add_nih_info(pi_unique, col = 'org_state', replace_with = 'md')
pi_unique = add_nih_info(pi_unique, col = 'org_zipcode', replace_with = '20892')
pi_unique = add_nih_info(pi_unique, col = 'org_country', replace_with = 'united states')

In [89]:
pi_unique.head(20)

Unnamed: 0,pi_ids,pi_names,org_name,org_city,org_state,org_country,org_zipcode,org_city_copy,org_state_copy,org_zipcode_copy,org_country_copy
0,10799126,"scott, stuart alexander",icahn school of medicine at mount sinai,new york,ny,united states,100296574,,,,
1,7017365,"polster, brian m",university of maryland baltimore,baltimore,md,united states,212011508,,,,
2,1862210,"zucker, robert alpert",university of michigan,ann arbor,mi,united states,481091276,,,,
5,1866930,"so, peter t",massachusetts institute of technology,cambridge,ma,united states,21421029,,,,
6,9851446,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,,,,
6,12280974,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,,,,
6,10329759,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,,,,
6,12572655,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,,,,
6,11706552,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209,,,,
7,11044822,"gade, terence p",university of pennsylvania,philadelphia,pa,united states,191046205,,,,


In [90]:
to_keep = [col for col in pi_unique.columns if '_copy' not in col]
pi_unique = pi_unique[to_keep]
#pi_unique.filter(regex = '_copy')

## Final information 

In [91]:
pi_unique.head(20)

Unnamed: 0,pi_ids,pi_names,org_name,org_city,org_state,org_country,org_zipcode
0,10799126,"scott, stuart alexander",icahn school of medicine at mount sinai,new york,ny,united states,100296574
1,7017365,"polster, brian m",university of maryland baltimore,baltimore,md,united states,212011508
2,1862210,"zucker, robert alpert",university of michigan,ann arbor,mi,united states,481091276
5,1866930,"so, peter t",massachusetts institute of technology,cambridge,ma,united states,21421029
6,9851446,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209
6,12280974,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209
6,10329759,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209
6,12572655,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209
6,11706552,"eden, uri tzvi; frank, loren m; ganguli, surya...",cold spring harbor laboratory,cold spring harbor,ny,united states,117242209
7,11044822,"gade, terence p",university of pennsylvania,philadelphia,pa,united states,191046205


In [92]:
pi_unique.to_csv('pi_info.csv', index = False, compression = 'gzip')