# Cleaning PI Info
In order to prevent duplicate information used to train a model, store PI information separately from list of features.

**Eventually store as SQL database**

In [3]:
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import cleaning_strings as cln
import importlib as imp
imp.reload(cln);

## Local functions 

In [4]:
def add_zero_zip(df, length, col1 = 'org_zipcode', col2 = 'org_country', country = 'united states'):
    
    '''Add leading zero to New England zipcodes.'''
    
    original = []
    actual = []
    for num in df.ix[(df[col1].str.len() == length) & (df[col2].str.contains(country))][col1]:
        original.append(num)
        num = '0' + num
        actual.append(num)
    return original, actual

def add_zero_duns(df, col1 = 'org_duns'):
    
    '''Add leading zeros to DUNS numbers until DUNS number is 9 digits.'''
    
    actual = []
    series = df[col1].tolist()
    for num in series:
        if type(num) == float:
            pass
        else:
            while len(num) < 9.0:
                num = '0' + num
        actual.append(num)
    df[col1 + '_'] = actual
    return df

def replace_nih_info(df, dict_, org_name = 'org_name'):
    
    '''Some NIH centers have updated names.
    Replace the appropriate grants with this information.'''
    
    for k, v in dict_.items():
        df.replace({k:v}, inplace = True)
        df.ix[df[org_name] == v] = df.ix[df[org_name] == v].fillna(method = 'bfill').fillna(method = 'ffill')
    return df

def add_nih_info(df, col, replace_with, org_name = 'org_name', org_country = 'org_country'):
    
    '''Add NIH information.
    NIH investigators are those whose organization is NOT null, but whose country is null.
    
    A note on replacing DUNS numbers:
    Each separate institute should have an individual DUNS number,
    therefore replace_with should be a dictionary where
    key:value is org_name:(dummy) org_duns'''
    
    if col == 'org_duns':
        df_replace = []
        for k, v in replace_with.items():
            df_fill = pd.DataFrame(df.ix[df[org_name] == k][col].fillna(v))
            df_replace.append(df_fill)
        df_replace = pd.concat(df_replace)
    
    else:
        df_replace = pd.DataFrame(df.ix[~df[org_name].isnull() & df[org_country].isnull()]\
                                  [col].fillna(replace_with))
    df_merged = pd.merge(df, df_replace, how = 'left', left_index=True, right_index=True, suffixes=('', '_copy'))
    df_merged[col].fillna(df_merged[col + '_copy'], inplace = True)
    return df_merged

## Cleaning PI information
We want a dataframe where each row is a single PI (no duplicates) and associated organization information as a cross-reference to the grants data.

Import relevant columns from csv with raw grant information.

In [5]:
columns = 'fy pi_ids pi_names org_name org_city org_state org_country org_zipcode org_duns'.split()
dtypes = {key: str for key in columns}

In [6]:
pi_info_raw = pd.read_csv('all_grants.csv', compression = 'gzip', usecols = columns, dtype = dtypes)

#Only analyzing grants from 2000 onwards, as prior to that no funding information is available
pi_info_raw = pi_info_raw.ix[pi_info_raw['fy'] >= '2000']
pi_info_raw.tail()

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
2223287,2016,lawrence,united states,76248616,university of kansas lawrence,ks,660457568,9524770;,"mcgill, jodi l.;"
2223288,2016,davis,united states,47120084,university of california at davis,ca,956186153,6490459;,"clancy, colleen e;"
2223289,2016,la jolla,united states,804355790,university of california san diego,ca,920930934,1901669;,"feng, gen-sheng ;"
2223290,2016,coral gables,united states,52780918,university of miami school of medicine,fl,331462926,10944221;,"sharifai, nima ;"
2223291,2016,toledo,united states,51623734,university of toledo,oh,436063390,9288457;,"liu, song-tao ;"


In [7]:
pi_info_cleaned = cln.strip_df(pi_info_raw, ' ', ';', ' ', '.')
pi_info_cleaned.tail()

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
2223287,2016,lawrence,united states,76248616,university of kansas lawrence,ks,660457568,9524770,"mcgill, jodi l"
2223288,2016,davis,united states,47120084,university of california at davis,ca,956186153,6490459,"clancy, colleen e"
2223289,2016,la jolla,united states,804355790,university of california san diego,ca,920930934,1901669,"feng, gen-sheng"
2223290,2016,coral gables,united states,52780918,university of miami school of medicine,fl,331462926,10944221,"sharifai, nima"
2223291,2016,toledo,united states,51623734,university of toledo,oh,436063390,9288457,"liu, song-tao"


In [8]:
pi_info_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 653801 entries, 1569491 to 2223291
Data columns (total 9 columns):
fy             653801 non-null object
org_city       625577 non-null object
org_country    625632 non-null object
org_duns       611923 non-null object
org_name       653128 non-null object
org_state      617661 non-null object
org_zipcode    622758 non-null object
pi_ids         653801 non-null object
pi_names       653801 non-null object
dtypes: object(9)
memory usage: 49.9+ MB


## Splitting PI Info
Some grants have multiple PIs listed on the grant, and the information for the group of PIs is listed only as the contact PI's information. In order to get unique PI information, these PI groups must be separated into individual PIs.

Split pi_info_cleaned into two dataframes, one containing grouped (multiple) PIs and one containing single PIs.

In [9]:
multi_pi_unsplit = pi_info_cleaned.ix[pi_info_cleaned['pi_ids'].str.contains('contact', na = False)]
pi_info = pi_info_cleaned.drop(multi_pi_unsplit.index)

Drop duplicates to obtain unique PIs that were awarded solo grants. Check both PI IDs and organization name to check for PIs that moved to a different institution.

In [10]:
pi_info.shape
pi_unique = pi_info.drop_duplicates(['pi_ids', 'org_duns'])
pi_unique.shape
pi_unique.tail()

(606233, 9)

(158444, 9)

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
2223248,2016,champaign,united states,41544081,university of illinois at urbana-champaign,il,618207473,2084901,"bolton, eric c"
2223251,2016,davis,united states,47120084,university of california at davis,ca,956186153,9856365,"juliano, celina"
2223255,2016,seattle,united states,605799469,university of washington,wa,981959472,11678618,"gerner, michael"
2223258,2016,omaha,united states,168559177,university of nebraska medical center,ne,681987835,1873357,"sanderson, sam d"
2223273,2016,hartford,united states,807853791,connecticut state dept of public health,ct,61061367,14753985,"gonsalves, lou"


Split multiple PIs on PI ID.

In [11]:
multi_pi = cln.split_rows(multi_pi_unsplit, 'pi_ids', by = ';')
multi_pi.tail()

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_names,pi_ids
2223041,2016,atlanta,united states,66469933,emory university,ga,303224250,"jovanovic, tanja ; smith, alicia k. (contact)",8800131 (contact)
2223047,2016,new york,united states,41968306,new york university,ny,100122300,"pyle, anna marie; schlick, tamar (contact)",1885894
2223047,2016,new york,united states,41968306,new york university,ny,100122300,"pyle, anna marie; schlick, tamar (contact)",2414701 (contact)
2223241,2016,baltimore,united states,188435911,university of maryland baltimore,md,212011508,"kao, joseph pao yung; mayer, dirk (contact)",8080184
2223241,2016,baltimore,united states,188435911,university of maryland baltimore,md,212011508,"kao, joseph pao yung; mayer, dirk (contact)",9340781 (contact)


Create a df where the '(contact)' string has been stripped from pi_ids; this will allow identification of unique PI IDs (otherwise an ID with '(contact)' appended at the end is viewed as a unique string).

In [12]:
multi_stripped = multi_pi.copy()
multi_stripped['pi_ids'] = multi_stripped['pi_ids'].str.strip(' (contact)')
multi_stripped.tail()

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_names,pi_ids
2223041,2016,atlanta,united states,66469933,emory university,ga,303224250,"jovanovic, tanja ; smith, alicia k. (contact)",8800131
2223047,2016,new york,united states,41968306,new york university,ny,100122300,"pyle, anna marie; schlick, tamar (contact)",1885894
2223047,2016,new york,united states,41968306,new york university,ny,100122300,"pyle, anna marie; schlick, tamar (contact)",2414701
2223241,2016,baltimore,united states,188435911,university of maryland baltimore,md,212011508,"kao, joseph pao yung; mayer, dirk (contact)",8080184
2223241,2016,baltimore,united states,188435911,university of maryland baltimore,md,212011508,"kao, joseph pao yung; mayer, dirk (contact)",9340781


Create columns in split multi_pi df denoting whether the PI is the contact and whether the PI ID is already present in the df pi_unique (which contains all PIs that are solo authors of a grant). If the PI is already in pi_unique, we do not need to separate their information again.

In [13]:
multi_pi['contact'] = multi_pi['pi_ids'].str.contains('contact', na = False)
in_pi_unique = multi_stripped['pi_ids'].isin(pi_unique['pi_ids'])
multi_pi['unique_pi'] = in_pi_unique
multi_pi.tail()

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_names,pi_ids,contact,unique_pi
2223041,2016,atlanta,united states,66469933,emory university,ga,303224250,"jovanovic, tanja ; smith, alicia k. (contact)",8800131 (contact),True,True
2223047,2016,new york,united states,41968306,new york university,ny,100122300,"pyle, anna marie; schlick, tamar (contact)",1885894,False,True
2223047,2016,new york,united states,41968306,new york university,ny,100122300,"pyle, anna marie; schlick, tamar (contact)",2414701 (contact),True,True
2223241,2016,baltimore,united states,188435911,university of maryland baltimore,md,212011508,"kao, joseph pao yung; mayer, dirk (contact)",8080184,False,True
2223241,2016,baltimore,united states,188435911,university of maryland baltimore,md,212011508,"kao, joseph pao yung; mayer, dirk (contact)",9340781 (contact),True,True


One PI is always designated as a contact if there are multiple PIs listed on a grant. If the PI's ID was not in the df pi_unique, but is listed as a contact (that is, unique_pi == False but contact == True), then we can isolate their information and add this information to pi_unique.

In [14]:
unique_contact = multi_pi[(multi_pi['unique_pi'] == False) & (multi_pi['contact'] == True)]
unique_contact = cln.split_rows(unique_contact, 'pi_names', ';')
unique_contact = unique_contact[unique_contact['pi_names'].str.contains('contact')]
unique_contact = cln.strip_series(unique_contact, ['pi_ids', 'pi_names'])

#shape before dropping duplicates
unique_contact.shape
unique_contact = unique_contact.drop_duplicates('pi_ids org_duns'.split())

#shape after dropping duplicates
unique_contact.shape
unique_contact.tail()

(9196, 11)

(4228, 11)

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,contact,unique_pi,pi_names
2221202,2016,tucson,united states,79416826,"synactix pharmaceuticals, inc",az,857182014,10738456,True,False,"li, hong-yu"
2221550,2016,minneapolis,united states,555917996,university of minnesota,mn,554552070,6169950,True,False,"peterson, kevin arthur"
2221747,2016,rochester,united states,6471700,mayo clinic rochester,mn,559050001,9965561,True,False,"pereira, naveen luke"
2221778,2016,baltimore,united states,78748558,"elixirgen, llc",md,212051511,11903165,True,False,"mano, tomokazu"
2222666,2016,boston,united states,73130411,massachusetts general hospital,ma,21142696,1902302,True,False,"cahill, daniel p"


If the PI ID is neither listed in unique_pi nor are they ever listed as a contact, then we cannot identify whether their organization information is actually different from the contact PI's information. These names will therefore not be split (the IDs are already split).

**Note:** When doing analysis from multiple years, PI IDs should be cross-referenced across years in case a PI did have a solo grant in one year but not in others.

In [15]:
not_contact = multi_pi[(multi_pi['unique_pi'] == False) & (multi_pi['contact'] == False)]
not_contact = not_contact.drop_duplicates('pi_ids org_duns'.split())
not_contact.shape
not_contact.tail()

(7949, 11)

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_names,pi_ids,contact,unique_pi
2222255,2016,aurora,united states,41096314,university of colorado denver,co,800452571,"d'sa, viren ; deoni, sean cl (contact); hobbin...",12291424,False,False
2222255,2016,aurora,united states,41096314,university of colorado denver,co,800452571,"d'sa, viren ; deoni, sean cl (contact); hobbin...",6667032,False,False
2222255,2016,aurora,united states,41096314,university of colorado denver,co,800452571,"d'sa, viren ; deoni, sean cl (contact); hobbin...",6870571,False,False
2222458,2016,providence,united states,1785542,brown university,ri,29129002,"chan, philip andrew; mena, leandro antonio; nu...",8556604,False,False
2222666,2016,boston,united states,73130411,massachusetts general hospital,ma,21142696,"brastianos, priscilla kaliopi; cahill, daniel ...",11213017,False,False


Join all dfs containing unique PI ids.

In [16]:
#Reorder columns so joins can be performed correctly

#unique PIs with a solo grant
cols1 = unique_contact.columns.tolist()
cols1 = cols1[-4:-3] + cols1[-1:] + cols1[0:5]
unique_1 = unique_contact[cols1]

#unique PIs that neither have a solor grant nor are listed as a contact
cols2 = not_contact.columns.tolist()
cols2 = cols2[-3:-2] + cols2[0:6]
unique_2 = not_contact[cols2]

#unique PIs that do not have solo grants but have a
unique_multi = unique_1.append(unique_2)
unique_multi = unique_multi.drop_duplicates('pi_ids org_name'.split())
unique_multi.shape

pi_unique.shape
pi_unique = pi_unique.append(unique_multi)

pi_unique = cln.strip_series(pi_unique, ['pi_ids'], strip = ' ')
pi_unique = pi_unique.drop_duplicates('pi_ids org_name'.split())
pi_unique.shape

(10874, 8)

(158444, 9)

(133121, 9)

## Fixing zipcodes
The leading zero of New England zipcodes was dropped in the raw data. Add leading zero to zipcodes from the US that are length 8 or 4.

In [17]:
pi_unique['org_zipcode'].str.len().value_counts()

9.0     84432
5.0     27251
8.0      7662
7.0       669
4.0       531
6.0       185
3.0       108
1.0        53
10.0       11
2.0        10
Name: org_zipcode, dtype: int64

In [18]:
zip_8, zip_9 = add_zero_zip(pi_unique, 8.0)
zip_4, zip_5 = add_zero_zip(pi_unique, 4.0)

In [19]:
pi_unique['org_zipcode'] = pi_unique['org_zipcode'].replace(zip_8, zip_9)
pi_unique['org_zipcode'] = pi_unique['org_zipcode'].replace(zip_4, zip_5)

In [20]:
pi_unique['org_zipcode'].str.len().value_counts()

9.0     92022
5.0     27383
7.0       669
4.0       399
6.0       185
3.0       108
8.0        72
1.0        53
10.0       11
2.0        10
Name: org_zipcode, dtype: int64

## DUNS numbers
The DUNS number should be a 9-digit number that uniquely identifies an organization. However, the DUNS numbers of organizations changed between 2008 to 2009.

In [154]:
pi_unique.ix[(pi_unique['org_name'].str.contains('stanford', na = False)) & ((pi_unique['fy'] == '2008') | (pi_unique['fy'] == '2009'))]

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
1482486,2008,stanford,united states,800771545,stanford university,ca,943056203.0,7569301,"kesler, shelli r"
1482494,2008,stanford,united states,800771545,stanford university,ca,943056203.0,3121111,"bryant, zev"
1482881,2008,stanford,united states,800771545,stanford university,ca,943056203.0,9167595,"chang, catherine elizabeth"
1482892,2008,stanford,united states,800771545,stanford university,ca,943056203.0,9298617,"samanez larkin, gregory r"
1482897,2008,stanford,united states,800771545,stanford university,ca,943056203.0,9294648,"banko, max ryan"
1483451,2008,stanford,united states,800771545,stanford university,ca,943056203.0,8470072,"brown, justin e"
1483461,2008,stanford,united states,800771545,stanford university,ca,943056203.0,9155238,"hernandez, armando ricardo"
1483463,2008,stanford,united states,800771545,stanford university,ca,943056203.0,9201684,"pina, francisco javier"
1483505,2008,stanford,united states,800771545,stanford university,ca,943056203.0,7028019,"esparza, lourdes adriana"
1483675,2008,stanford,united states,800771545,stanford university,ca,943056203.0,8822430,"burrows, brittany"


In [156]:
pi_unique.ix[pi_unique['org_duns'] == '009214214']['org_name'].unique()

array(['burnham institute for medical research', 'stanford university'], dtype=object)

The DUNS number is also not unique between 2008 and 2009.

In [159]:
pi_unique.ix[(pi_unique['org_name'].str.contains('burnham', na = False)) & ((pi_unique['fy'] == '2008') | (pi_unique['fy'] == '2009'))]

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
1482883,2008,la jolla,united states,9214214,burnham institute for medical research,ca,92037,9168920,"opaluch, amanda"
1484399,2008,la jolla,united states,9214214,burnham institute for medical research,ca,92037,8967204,"miletic sedy, ana"
1492092,2008,la jolla,united states,9214214,burnham institute for medical research,ca,92037,8795531,"bode, lars"
1498546,2008,la jolla,united states,9214214,burnham institute for medical research,ca,92037,6209347,"lowe, john b"
1498549,2008,la jolla,united states,9214214,burnham institute for medical research,ca,92037,1893535,"von andrian, ulrich h"
1499820,2008,la jolla,united states,9214214,burnham institute for medical research,ca,92037,6767953,"gu, zezong"
1499821,2008,la jolla,united states,9214214,burnham institute for medical research,ca,92037,8775300,"riedl, stefan j"
1499822,2008,la jolla,united states,9214214,burnham institute for medical research,ca,92037,1864249,"masliah, eliezer"
1515340,2008,la jolla,united states,9214214,burnham institute for medical research,ca,92037,1883552,"muller-sieburg, christa e"
1517543,2008,la jolla,united states,9214214,burnham institute for medical research,ca,92037,1940901,"bradley, linda mac pherson"


Replace DUNS numbers from 2008 and earlier with current DUNS numbers.

In [162]:
before_2008 = pi_unique.ix[pi_unique['fy'] <= '2008']
before_2008.shape
after_2008 = pi_unique.ix[pi_unique['fy'] > '2008']
after_2008.shape

(126611, 9)

(90854, 9)

In [186]:
from difflib import SequenceMatcher
def similar(str1, str2):
    return SequenceMatcher(None, str1, str2).ratio()
similar('burnham institute for medical research', 'sanford-burnham medical research instit')

0.6233766233766234

In [190]:
names_after_2008 = after_2008['org_name'].dropna().unique()
names_before_2008 = before_2008['org_name'].dropna().unique()
names_after_2008.shape
names_before_2008.shape

(9219,)

(9926,)

In [191]:
similarity = {}
for i in range(len(names_after_2008)):
    str1 = names_after_2008[i]
    for j in range(len(names_before_2008)):
        str2 = names_before_2008[j]
        sim_ratio = similar(str1, str2)
        if sim_ratio >= 0.5:
            similarity[str1] = (str2, sim_ratio)

In [194]:
similarity

{'iq solutions, inc': ('celsense, inc', 0.5333333333333333),
 'university of massachusetts amherst': ('university legal services', 0.5),
 'fox chase cancer center': ('oregon advocacy center', 0.5777777777777777),
 'duke university': ('university legal services', 0.5),
 'centre for chronic disease control': ('advocacy center for persons w/ disabilit',
  0.5135135135135135),
 'investigaciones medicas en salud': ('xcision medical systems, llc', 0.5),
 'mu-jhu care': ('mu-jhu care', 1.0),
 'universidad nacional mayor de san marcos': ('traditional &modern health practioners',
  0.5128205128205128),
 'h. lee moffitt cancer ctr &res inst': ('cancer research uk cambridge res inst',
  0.5),
 'oregon health &science university': ('oregon advocacy center',
  0.509090909090909),
 'university of california at davis': ('university legal services',
  0.5862068965517241),
 'icahn school of medicine at mount sinai': ('faculty of medicine of timone',
  0.5588235294117647),
 'dana-farber cancer inst': ('

Organization names and DUNS change between 2008 and 2009, and the DUNS are not unique between the two epochs. It will be difficult to cross reference each institution across years. For the first pass, we will only analyze grants from 2009 and later.

In [195]:
pi_unique = pi_unique.ix[pi_unique['fy'] > '2008']

### DUNS numbers of different lengths
DUNS numbers should be 9 digits.

In [21]:
pi_unique['org_duns'].str.len().value_counts()

9.0     89283
8.0     27878
7.0      7516
20.0      384
19.0       59
Name: org_duns, dtype: int64

In [22]:
pi_unique.ix[pi_unique['org_duns'].str.len() == 8.0].head()

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
1571089,2009,atlanta,united states,13597081,georgia state dept of human resources,ga,303033181,9593571,"gibson, jevon"
1571494,2009,warwick,united states,79037218,chariho tri-town task force,ri,2889,11519539,"augustinho, paula"
1571511,2009,wenatchee,united states,3480039,chelan-douglas counties together for dfy,wa,98801,10159604,"hunter, renee"
1590698,2009,mountain view,united states,60274473,"ccs associates, inc",ca,940430816,10200029,"sigman, caroline"
1591160,2009,atlanta,united states,13597081,georgia state dept of human resources,ga,303033181,10374552,"brown, stewart"


In [23]:
pi_unique.ix[pi_unique['org_duns'].str.len() == 7.0].head()

Unnamed: 0,fy,org_city,org_country,org_duns,org_name,org_state,org_zipcode,pi_ids,pi_names
1668444,2010,salt lake city,united states,9095365,university of utah,ut,841128930,8601681,"parks, thomas"
1668446,2010,hampton,united states,3135068,hampton university,va,236680108,8206462,"harvey, william r"
1668452,2010,chicago,united states,5421136,university of chicago,il,606375418,8596104,"hopkins, michael d"
1668463,2010,pittsburgh,united states,4514360,university of pittsburgh at pittsburgh,pa,152132303,2434971,"levine, arthur s"
1668472,2010,university park,united states,3403953,pennsylvania state university-univ park,pa,168027000,10378392,"foley, henry c"


It seems like most of the 8 and 7 digit DUNS numbers are missing leading zeros. Assume this is the case and add leading zeros to DUNS numbers.

In [24]:
pi_unique = add_zero_duns(pi_unique)
del pi_unique['org_duns']
pi_unique = pi_unique.rename(columns = {'org_duns_':'org_duns'})

Re-organize columns.

In [25]:
old_cols = pi_unique.columns.tolist()
new_cols = old_cols[0:1] + old_cols[-3:-1] + old_cols[-1:] + old_cols[3:4] + old_cols[1:2] + old_cols[4:5] + old_cols[5:6] + old_cols[2:3]
pi_unique = pi_unique[new_cols]
pi_unique.head()

Unnamed: 0,fy,pi_ids,pi_names,org_duns,org_name,org_city,org_state,org_zipcode,org_country
1569491,2009,,",",,"iq solutions, inc",rockville,md,208523003,united states
1569492,2009,1859426.0,"park, no-hee",92530369.0,university of california los angeles,los angeles,ca,900952000,united states
1569493,2009,1864471.0,"forrest, john n",77470003.0,mount desert island biological lab,salsbury cove,me,4672,united states
1569494,2009,1889505.0,"macrina, francis l",105300446.0,virginia commonwealth university,richmond,va,232980568,united states
1569495,2009,1961084.0,"moore, holly marie",167204994.0,new york state psychiatric institute,new york,ny,10032,united states


According to NIH's description, multiple DUNS are separated by a semi-colon. There are only two groups where multiple DUNS are listed, those with 19 characters and those with 20 characters.

In [26]:
pi_unique.ix[pi_unique['org_duns'].str.len() == 20.0].head()

Unnamed: 0,fy,pi_ids,pi_names,org_duns,org_name,org_city,org_state,org_zipcode,org_country
1846017,2012,9470515,"pacholczyk, rafal wojciech",809593387; 966668691,georgia regents university,augusta,ga,309120004,united states
1846028,2012,6491945,"dong, zheng",809593387; 966668691,georgia regents university,augusta,ga,309120004,united states
1846522,2012,10353675,"loria, analia",809593387; 966668691,georgia regents university,augusta,ga,309120004,united states
1846817,2012,2091767,"mellor, andrew lee",809593387; 966668691,georgia regents university,augusta,ga,309120004,united states
1847006,2012,10645764,"price, roderick",609980727; 962545658,texas tech university health scis center,lubbock,tx,794306271,united states


In [27]:
pi_unique.ix[pi_unique['org_duns'].str.len() == 19.0].head()

Unnamed: 0,fy,pi_ids,pi_names,org_duns,org_name,org_city,org_state,org_zipcode,org_country
1669155,2010,9437629,"risher, william christopher",809593387;966668691,georgia regents university,augusta,ga,309124810,united states
1669167,2010,9699188,"kelly-cobbs, aisha imani",809593387;966668691,georgia regents university,augusta,ga,309124810,united states
1669753,2010,10123038,"rafikova, olga",809593387;966668691,georgia regents university,augusta,ga,309124810,united states
1671078,2010,8815155,"wu, wei-hua e",809593387;966668691,georgia regents university,augusta,ga,309124810,united states
1671793,2010,6225753,"kohan, donald e",809593387;966668691,georgia regents university,augusta,ga,309124810,united states


For organizations where two DUNS are listed, it seems that both DUNS are used to identify the organization. The only difference between a DUNS with 20 characters and 19 characters is a space after the ';'. Add a space to the 19-length DUNS and replace these values in the df.

In [28]:
#Get the DUNS numbers from pi_unique where the length of the DUNS is 19
len19 = pi_unique.ix[pi_unique['org_duns'].str.len() == 19.0]['org_duns'].values.tolist()

#add a space after the semi-colon
len19_new = []
for i in range(len(len19)):
    len19_new.append(len19[i][:10] + ' ' + len19[i][10:])

#replace the original DUNS of 19 characters
pi_unique['org_duns'] = pi_unique['org_duns'].replace(len19, len19_new)

Remove duplicates by pi_id and org_duns, as a PI can move between institutions.

In [29]:
pi_unique.shape
pi_unique = pi_unique.drop_duplicates('pi_ids org_duns'.split())
pi_unique.shape

(133121, 9)

(132772, 9)

## Examining PIs from non-US countries

In [30]:
pd.set_option('max_rows', 1000)
pi_unique.org_country.value_counts()

united states     127870
canada               392
south africa         249
united kingdom       226
switzerland          108
australia            108
uganda                97
kenya                 89
ethiopia              79
india                 68
china                 65
tanzania u rep        63
zambia                60
nigeria               47
israel                44
france                44
brazil                41
germany               39
peru                  37
netherlands           36
malawi                35
mozambique            35
rwanda                33
vietnam               32
thailand              31
argentina             31
zimbabwe              29
cote d'ivoire         29
sweden                28
botswana              26
haiti                 23
fed micronesia        19
denmark               19
italy                 16
mali                  16
bangladesh            16
pakistan              15
ghana                 15
namibia               14
cambodia              14


Although most PIs are from the US, there are a significant number not from the US, and some countries are not listed.

In [31]:
null_country = pi_unique.ix[pi_unique['org_country'].isnull()]
null_country['org_name'].value_counts()

#names of organizations where country is not listed
orgs_null_country = null_country['org_name'].unique()

#names of organizations with zipcode beginning with 20892 (NIH organizations)
nih_orgs = pi_unique.ix[pi_unique['org_zipcode'].str.contains('20892', na = False)]['org_name'].unique()

basic sciences                                           447
niaid extramural activities                              203
diabetes, digestive, kidney diseases                     142
child health and human development                       134
environmental health sciences                            121
heart, lung, and blood institute                         110
aging                                                     89
national institute of mental health                       78
clinical center                                           75
neurological disorders and stroke                         75
cancer epidemiology and genetics                          63
national eye institute                                    59
human genome research                                     57
national institute on drug abuse                          50
dental &craniofacial  research                            41
alcohol abuse and alcoholism                              30
arthritis, musculoskelet

The organizations with no country listed and only one entry are generally misspellings and/or have missing information that can be easily filled. Hard code these data manually.

In [32]:
#fill in missing entries for loyola university entry
pi_unique.loc[1947303].fillna({'org_city':'chicago', 'org_country':'united states'}, inplace = True)

#Danmarks Tekniske
pi_unique.loc[1845456].fillna({'org_duns':'311132885', 'org_city':'Kongens Lyngby', 
                                  'org_country':'Denmark'}, inplace = True)

#Defense Analyses, Inc
pi_unique.loc[1779779].fillna(pi_unique.loc[2172282, :], inplace = True)
pi_unique.loc[2172282].fillna(pi_unique.loc[1779779, :], inplace = True)
pi_unique.loc[2172282].replace({'institute for defense ana':'institute for defense analyses, inc'}, inplace = True)

#UCLA
pi_unique.loc[1845599].replace('california univ los angel', 'university of california los angeles', inplace = True)
pi_unique.loc[1845599].fillna(pi_unique.loc[1846510], inplace = True)

#IIT research
pi_unique.loc[1845463].replace('iit resch institute', 'iit research institute', inplace = True)
pi_unique.loc[1845463].fillna(pi_unique.loc[1667658, :], inplace = True)

#UCSF
rows = [1845453, 1860543]
for row in rows:
    pi_unique.loc[row].replace('california univ san franc', 'university of california, san francisco', inplace = True)
    pi_unique.loc[row].fillna(pi_unique.loc[1574842], inplace = True)

#Omnitec
pi_unique.ix[pi_unique['org_name'].str.contains('omnitec solutions', na = False)] = \
pi_unique.ix[pi_unique['org_name'].str.contains('omnitec solutions', na = False)]\
.fillna(method = 'ffill').fillna(method = 'bfill')

In [33]:
pi_unique.ix[pi_unique['org_country'].isnull()]['org_name'].value_counts(dropna = False)

basic sciences                                           447
niaid extramural activities                              203
NaN                                                      158
diabetes, digestive, kidney diseases                     142
child health and human development                       134
environmental health sciences                            121
heart, lung, and blood institute                         110
aging                                                     89
national institute of mental health                       78
clinical center                                           75
neurological disorders and stroke                         75
cancer epidemiology and genetics                          63
national eye institute                                    59
human genome research                                     57
national institute on drug abuse                          50
dental &craniofacial  research                            41
alcohol abuse and alcoho

### Filling missing NIH information

The remaining grants (except for 'children s hospital medic', which seems like an outlier) seem to all be associated with different NIH centers. Most information of these organizations, that is, DUNS number, city, state and zipcode in addition to country, are missing. There are some NIH centers where all this information is available; however, the name of the center is listed differently.

In [34]:
pi_unique.ix[pi_unique['org_zipcode'].str.contains('20892', na = False)]['org_name'].value_counts()

u.s. national inst allergy & infect dis     4
u.s. national inst/child hlth/human dev     4
u.s. national institute of mental health    3
u.s. national inst diabetes/digst/kidney    3
u.s. national inst alcohol ab/alcoholism    1
u.s. national inst/neuro/ds/stroke          1
u.s. national heart lung and blood inst     1
Name: org_name, dtype: int64

Replace/fill the corresponding NIH centers in the pi_unique dataframe with the information above.

In [36]:
dict_nih_names = {
    'national institute of allergy and infectious diseases':'u.s. national inst allergy & infect dis',
    'child health and human development':'u.s. national inst/child hlth/human dev',
    'diabetes, digestive, kidney diseases':'u.s. national inst diabetes/digst/kidney',
    'national institute of mental health':'u.s. national institute of mental health',
    'neurological disorders and stroke':'u.s. national inst/neuro/ds/stroke',
    'alcohol abuse and alcoholism':'u.s. national inst alcohol ab/alcoholism',
    'heart, lung, and blood institute':'u.s. national heart lung and blood inst'
}

pi_unique = replace_nih_info(pi_unique, dict_nih_names)

### Adding missing NIH information

In [38]:
pi_unique.ix[pi_unique['org_country'].isnull()]['org_name'].value_counts(dropna = False)

basic sciences                                         447
niaid extramural activities                            203
NaN                                                    158
environmental health sciences                          121
aging                                                   89
clinical center                                         75
cancer epidemiology and genetics                        63
national eye institute                                  59
human genome research                                   57
national institute on drug abuse                        50
dental &craniofacial  research                          41
arthritis, musculoskeletal, skin dis                    30
translational science                                   29
national library of medicine                            27
deafness &other communication disorders                 21
biomedical imaging &bioengineering                      20
clinical sciences                                       

No information about NIH city, state, country and zipcode is listed for the remaining NIH centers. Add this information as bethesda, md, united states and 20892, respectively. **Note:** This address may not be exact, as some NIH institutes/centers may be located elsewhere, but this information will represent general NIH information.

#### First clean NIH center names.

In [47]:
and_names = pd.DataFrame()
and_names['original'] = pi_unique.loc[pi_unique['org_name'].str.contains(' &[^ ]', na = False)]['org_name']
and_names['with_space'] = pi_unique.loc[pi_unique['org_name'].str.contains(' &[^ ]', na = False)]['org_name']\
.str.replace('&', '& ')

and_names = and_names.drop_duplicates()

no_space = and_names['original'].tolist()
with_space = and_names['with_space'].tolist()

In [49]:
pi_unique.replace(no_space, with_space, inplace = True)

In [50]:
pi_unique.ix[pi_unique['org_country'].isnull()]['org_name'].value_counts(dropna = False)

basic sciences                                         447
niaid extramural activities                            203
NaN                                                    158
environmental health sciences                          121
aging                                                   89
clinical center                                         75
cancer epidemiology and genetics                        63
national eye institute                                  59
human genome research                                   57
national institute on drug abuse                        50
dental & craniofacial  research                         48
deafness & other communication disorders                31
arthritis, musculoskeletal, skin dis                    30
translational science                                   29
national library of medicine                            27
biomedical imaging & bioengineering                     21
clinical sciences                                       

* 'clinical sciences' and 'clinical center' likely refer to the same center, as clinical sciences is listed in years < 2012 and afterwards clinical center is listed.
* Replace 'environmental health sciences' with 'national institute of environmental health sciences'
* Drop the 'children s hospital medic' entry.

In [68]:
a = ['clinical sciences', 'environmental health sciences']
b = ['clinical center', 'national institute of environmental health sciences']
pi_unique.replace(a, b, inplace = True)

In [54]:
to_drop = pi_unique.ix[pi_unique['org_name'] == 'children s hospital medic'].index
pi_unique.drop(to_drop, inplace = True)

In [69]:
pi_unique.ix[pi_unique['org_country'].isnull()]['org_name'].value_counts(dropna = False)

basic sciences                                         447
niaid extramural activities                            203
NaN                                                    158
national institute of environmental health sciences    126
clinical center                                         94
aging                                                   89
cancer epidemiology and genetics                        63
national eye institute                                  59
human genome research                                   57
national institute on drug abuse                        50
dental & craniofacial  research                         48
deafness & other communication disorders                31
arthritis, musculoskeletal, skin dis                    30
translational science                                   29
national library of medicine                            27
biomedical imaging & bioengineering                     21
national cancer institute                               

#### Add dummy DUNS numbers for NIH centers

In [77]:
nih_centers_list = pi_unique.ix[pi_unique['org_country'].isnull()]['org_name'].unique().tolist()
nih_centers_list.remove(np.nan)
nih_centers_list

['aging',
 'niaid extramural activities',
 'arthritis, musculoskeletal, skin dis',
 'complementary & alternative medicine',
 'basic sciences',
 'cancer epidemiology and genetics',
 'computer research and technology',
 'national institute on drug abuse',
 'deafness & other communication disorders',
 'dental & craniofacial  research',
 'biomedical imaging & bioengineering',
 'national institute of environmental health sciences',
 'national eye institute',
 'human genome research',
 'national library of medicine',
 'national institute of nursing research',
 'clinical center',
 'general medical sciences',
 'complementary & integrative health',
 'natl inst on  min  hlth &  hlth  disp',
 'translational science',
 'national cancer institute']

In [83]:
dummy_duns = []
for i in range(len(nih_centers_list)):
    j = str(i)
    if len(j) == 1.0:
        dummy_duns.append('00000000' + j)
    elif len(j) == 2.0:
        dummy_duns.append('0000000' + j)
dummy_duns

['000000000',
 '000000001',
 '000000002',
 '000000003',
 '000000004',
 '000000005',
 '000000006',
 '000000007',
 '000000008',
 '000000009',
 '000000010',
 '000000011',
 '000000012',
 '000000013',
 '000000014',
 '000000015',
 '000000016',
 '000000017',
 '000000018',
 '000000019',
 '000000020',
 '000000021']

In [87]:
dummy_info = dict(zip(nih_centers_list, dummy_duns))
dummy_info

{'aging': '000000000',
 'arthritis, musculoskeletal, skin dis': '000000002',
 'basic sciences': '000000004',
 'biomedical imaging & bioengineering': '000000010',
 'cancer epidemiology and genetics': '000000005',
 'clinical center': '000000016',
 'complementary & alternative medicine': '000000003',
 'complementary & integrative health': '000000018',
 'computer research and technology': '000000006',
 'deafness & other communication disorders': '000000008',
 'dental & craniofacial  research': '000000009',
 'general medical sciences': '000000017',
 'human genome research': '000000013',
 'national cancer institute': '000000021',
 'national eye institute': '000000012',
 'national institute of environmental health sciences': '000000011',
 'national institute of nursing research': '000000015',
 'national institute on drug abuse': '000000007',
 'national library of medicine': '000000014',
 'natl inst on  min  hlth &  hlth  disp': '000000019',
 'niaid extramural activities': '000000001',
 'trans

#### Add NIH info

In [114]:
pi_unique = add_nih_info(pi_unique, col = 'org_duns', replace_with = dummy_info)
pi_unique = add_nih_info(pi_unique, col = 'org_city', replace_with = 'bethesda')
pi_unique = add_nih_info(pi_unique, col = 'org_state', replace_with = 'md')
pi_unique = add_nih_info(pi_unique, col = 'org_zipcode', replace_with = '20892')
pi_unique = add_nih_info(pi_unique, col = 'org_country', replace_with = 'united states')

In [115]:
to_keep = [col for col in pi_unique.columns if '_copy' not in col]
pi_unique = pi_unique[to_keep]
#pi_unique.filter(regex = '_copy')

## Investigating other missing information

In [131]:
pi_unique.shape

(132771, 9)

In [130]:
pi_unique.isnull().sum()

fy                 0
pi_ids             0
pi_names        5815
org_duns        5986
org_name         313
org_city         171
org_state       5729
org_zipcode    10155
org_country      158
dtype: int64

Proportion of null values per column.

In [133]:
pi_unique.isnull().sum() / len(pi_unique)

fy             0.000000
pi_ids         0.000000
pi_names       0.043797
org_duns       0.045085
org_name       0.002357
org_city       0.001288
org_state      0.043149
org_zipcode    0.076485
org_country    0.001190
dtype: float64

In [119]:
pi_unique.ix[pi_unique['org_country'].isnull()]['org_name'].value_counts(dropna = False)

NaN    158
Name: org_name, dtype: int64

In [127]:
pi_unique.ix[pi_unique['org_country'].isnull()].count()

fy             158
pi_ids         158
pi_names       158
org_duns        67
org_name         0
org_city         0
org_state        0
org_zipcode      0
org_country      0
dtype: int64

Entries where country is USA but zipcode is not listed.

In [135]:
pi_unique.ix[(pi_unique['org_country'] == 'united states') & \
            pi_unique['org_zipcode'].isnull()].count()

fy             8880
pi_ids         8880
pi_names       3292
org_duns       8879
org_name       8880
org_city       8880
org_state      5676
org_zipcode       0
org_country    8880
dtype: int64

In [136]:
pi_unique.ix[(pi_unique['org_country'] == 'united states') & \
            pi_unique['org_zipcode'].isnull()]

Unnamed: 0,fy,pi_ids,pi_names,org_duns,org_name,org_city,org_state,org_zipcode,org_country
1569500,2009,9626486,,073133571,university of michigan,ann arbor,mi,,united states
1569500,2009,9600839,,073133571,university of michigan,ann arbor,mi,,united states
1569500,2009,9619410,,073133571,university of michigan,ann arbor,mi,,united states
1569510,2009,9479933,,004514360,university of pittsburgh at pittsburgh,pittsburgh,pa,,united states
1569600,2009,9789510,,073757627,children's hosp of philadelphia,philadelphia,pa,,united states
1570979,2009,9457395,,175303262,university of puerto rico mayaguez,mayaguez,pr,,united states
1571954,2009,9667316,"hapko, michael k.",020232971,va puget sound healthcare system,seattle,,,united states
1571978,2009,9566724,,040077133,james j peters va medical center,bronx,ny,,united states
1571979,2009,9724692,,067445429,edward hines jr va hospital,hines,il,,united states
1571991,2009,6970036,"suris, alina",007369325,va north texas health care system,dallas,,,united states


Grant entries where the DUNS number is missing.

In [137]:
pi_unique.ix[pi_unique['org_duns'].isnull()]

Unnamed: 0,fy,pi_ids,pi_names,org_duns,org_name,org_city,org_state,org_zipcode,org_country
1569491,2009,,",",,"iq solutions, inc",rockville,md,208523003,united states
1569504,2009,8413109,"sanchez, jorge luis",,investigaciones medicas en salud,lima,,,peru
1569604,2009,10040631,"freimund, jeremy",,lummi nation,bellingham,wa,982269291,united states
1571042,2009,10739518,"arnold, melissa",,american academy of pediatrics,worthington,oh,430165210,united states
1571045,2009,9804838,"martin, patti",,arkansas children's hosp (little rock),little rock,ar,72202,united states
1571046,2009,9310735,"kelso warren, michele",,local government commission,sacramento,ca,95814,united states
1571048,2009,9807402,"monroe, timothy",,"techsoft group, llc",duluth,ga,300968688,united states
1571050,2009,9809463,"renner, megan",,"u.s. breastfeeding committee, inc",washington,dc,20036,united states
1571054,2009,10132911,"gilman, iliana",,austin-travis county mh-mr center,austin,tx,78702,united states
1571055,2009,10132913,"stone, george",,corporation for supportive housing,new york,ny,100041607,united states


In [145]:
null_duns = pi_unique.ix[pi_unique['org_duns'].isnull()]['org_name'].unique().tolist()
null_duns.remove(np.nan)
sorted(null_duns)

['20th judicial cir adult drug treat court',
 '211 brevard, inc',
 '211 maine,  inc',
 '22 judicial district court of louisiana',
 '24th judicial district attorney',
 '2nd judicial district court',
 '41b district court',
 'a global healthcare public foundation',
 'a voice for all, inc',
 'aamft research and education foundation',
 'abc unified school district',
 'access/wholistic/productive living inst',
 'acclaro research solutions, inc',
 'aconda',
 'act missouri',
 'actigraph, llc',
 'action for  betterment of the community',
 'action for a better community',
 'action for boston community development',
 'action network',
 'adair county environment hlth initiative',
 'adamhs board for montgomery county',
 'adams county health department',
 'adams state college',
 'adapt, inc',
 'addiction resource council, inc',
 'addictions care center of albany, inc',
 'addis ababa city council hiv/aids',
 'addis, inc',
 'administrative office of the court',
 'adolescent pregnancy prev coal of nc',

In [None]:
#Get list of unique org_names where DUNS is not listed
#Compare names to full df to see whether DUNS is listed elsewhere (org_duns.notnull())
#All organizations must have a DUNS number to receive a grant, but not worth it to look everything up
#Add in dummy duns

## Final information 

In [128]:
pi_unique.head(10)

Unnamed: 0,fy,pi_ids,pi_names,org_duns,org_name,org_city,org_state,org_zipcode,org_country
1569491,2009,,",",,"iq solutions, inc",rockville,md,208523003.0,united states
1569492,2009,1859426.0,"park, no-hee",92530369.0,university of california los angeles,los angeles,ca,900952000.0,united states
1569493,2009,1864471.0,"forrest, john n",77470003.0,mount desert island biological lab,salsbury cove,me,4672.0,united states
1569494,2009,1889505.0,"macrina, francis l",105300446.0,virginia commonwealth university,richmond,va,232980568.0,united states
1569495,2009,1961084.0,"moore, holly marie",167204994.0,new york state psychiatric institute,new york,ny,10032.0,united states
1569496,2009,9977355.0,"kurose, james f",153926712.0,university of massachusetts amherst,amherst,ma,10039242.0,united states
1569497,2009,1928042.0,"braun, robert e",42140483.0,jackson laboratory,bar harbor,me,46091500.0,united states
1569498,2009,1882056.0,"chernoff, jonathan d",73724262.0,fox chase cancer center,philadelphia,pa,191112434.0,united states
1569499,2009,10070938.0,"lange, peter",44387793.0,duke university,durham,nc,27705.0,united states
1569500,2009,9626486.0,,73133571.0,university of michigan,ann arbor,mi,,united states


In [92]:
pi_unique.to_csv('pi_info.csv', index = False, compression = 'gzip')