In [1]:
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [89]:
all_info = pd.read_csv('grants_all.csv')
pi_info_raw = all_info['pi_ids pi_names org_name org_city org_state org_country org_zipcode'.split()]

In [90]:
pi_info_raw.head()

Unnamed: 0,pi_ids,pi_names,org_name,org_city,org_state,org_country,org_zipcode
0,10799126;,"scott, stuart alexander;",icahn school of medicine at mount sinai,new york,ny,united states,100296574
1,7017365;,"polster, brian m;",university of maryland baltimore,baltimore,md,united states,212011508
2,7354078; 1862210 (contact);,"walton, maureen a; zucker, robert alpert (cont...",university of michigan,ann arbor,mi,united states,481091276
3,2275890 (contact); 8742217; 6139020;,"dulac, catherine g (contact); regev, aviv ; zh...",harvard university,cambridge,ma,united states,21385369
4,9868481; 2063694 (contact);,"chen, xuesong ; geiger, jonathan david (contact);",university of north dakota,grand forks,nd,united states,582026059


In [92]:
def strip_string(df, *args):
    '''
    
    Strips final character from each column in a df.
    
    '''
    for arg in args:
        df = df.apply(lambda s: s.str.strip(arg))
    return df

pi_info_cleaned = strip_string(pi_info_raw, ' ', ';', '.')

In [93]:
pi_info_cleaned.insert(2, 'contact', '')

In [94]:
pi_info_cleaned.ix[pi_info_cleaned['pi_ids'].str.contains('contact')].count()

pi_ids         9610
pi_names       9610
contact        9610
org_name       9610
org_city       9610
org_state      9494
org_country    9610
org_zipcode    9607
dtype: int64

In [95]:
pi_info_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71872 entries, 0 to 71871
Data columns (total 8 columns):
pi_ids         71872 non-null object
pi_names       71872 non-null object
contact        71872 non-null object
org_name       71766 non-null object
org_city       68527 non-null object
org_state      67740 non-null object
org_country    68528 non-null object
org_zipcode    68363 non-null object
dtypes: object(8)
memory usage: 4.4+ MB


In [96]:
pi_info_cleaned.head()

Unnamed: 0,pi_ids,pi_names,contact,org_name,org_city,org_state,org_country,org_zipcode
0,10799126,"scott, stuart alexander",,icahn school of medicine at mount sinai,new york,ny,united states,100296574
1,7017365,"polster, brian m",,university of maryland baltimore,baltimore,md,united states,212011508
2,7354078; 1862210 (contact),"walton, maureen a; zucker, robert alpert (cont...",,university of michigan,ann arbor,mi,united states,481091276
3,2275890 (contact); 8742217; 6139020,"dulac, catherine g (contact); regev, aviv ; zh...",,harvard university,cambridge,ma,united states,21385369
4,9868481; 2063694 (contact),"chen, xuesong ; geiger, jonathan david (contact)",,university of north dakota,grand forks,nd,united states,582026059


## Splitting PI Info

In [230]:
#one_pi = pd.DataFrame(columns = pi_info.columns)
#multi_pi = pd.DataFrame(columns = pi_info.columns)

multi_pi_unsplit = pi_info_cleaned.ix[pi_info_cleaned['pi_ids'].str.contains('contact')]

In [236]:
def split_rows(df, col_name):
    s = df[col_name].str.split(';').apply(pd.Series, 1).stack()
    s.index = s.index.droplevel(-1)
    s.name = col_name
    return s

def split_rows_pis(df, col1 = 'pi_ids', col2 = 'pi_names'):
    '''
    
    Different PIs are delimited by a ';'.
    Split on ';'
    
    '''
    df_copy = df.copy()
    x = split_rows(df_copy, col_name = col1)
    y = split_rows(df_copy, col_name = col2)
    
    #del df_copy[col1]
    del df_copy[col2]
    #df_new = df_copy.join(x)
    df_new = df_copy.join(y)
    return df_new

test = split_rows_pis(multi_pi_unsplit)
test.head()

Unnamed: 0,pi_ids,contact,org_name,org_city,org_state,org_country,org_zipcode,pi_names
2,7354078; 1862210 (contact),,university of michigan,ann arbor,mi,united states,481091276,"walton, maureen a"
2,7354078; 1862210 (contact),,university of michigan,ann arbor,mi,united states,481091276,"zucker, robert alpert (contact)"
3,2275890 (contact); 8742217; 6139020,,harvard university,cambridge,ma,united states,21385369,"dulac, catherine g (contact)"
3,2275890 (contact); 8742217; 6139020,,harvard university,cambridge,ma,united states,21385369,"regev, aviv"
3,2275890 (contact); 8742217; 6139020,,harvard university,cambridge,ma,united states,21385369,"zhuang, xiaowei"


In [99]:
pi_info = pi_info_cleaned.drop(multi_pi.index, axis = 0)

In [100]:
pi_info['contact'] = pi_info['contact'].replace('', 1)
pi_info.head()

Unnamed: 0,pi_ids,pi_names,contact,org_name,org_city,org_state,org_country,org_zipcode
0,10799126,"scott, stuart alexander",1,icahn school of medicine at mount sinai,new york,ny,united states,100296574
1,7017365,"polster, brian m",1,university of maryland baltimore,baltimore,md,united states,212011508
5,1866930,"so, peter t",1,massachusetts institute of technology,cambridge,ma,united states,21421029
7,11044822,"gade, terence p",1,university of pennsylvania,philadelphia,pa,united states,191046205
8,9335858,"qi, ling",1,cornell university,ithaca,ny,united states,148502820


In [206]:
multi_pi_unsplit.head()

Unnamed: 0,pi_ids,pi_names,contact,org_name,org_city,org_state,org_country,org_zipcode
2,7354078; 1862210 (contact),"walton, maureen a; zucker, robert alpert (cont...",,university of michigan,ann arbor,mi,united states,481091276
3,2275890 (contact); 8742217; 6139020,"dulac, catherine g (contact); regev, aviv ; zh...",,harvard university,cambridge,ma,united states,21385369
4,9868481; 2063694 (contact),"chen, xuesong ; geiger, jonathan david (contact)",,university of north dakota,grand forks,nd,united states,582026059
6,9851446; 2291297; 12280974; 9292719 (contact);...,"eden, uri tzvi; frank, loren m; ganguli, surya...",,cold spring harbor laboratory,cold spring harbor,ny,united states,117242209
12,10940848; 1897156 (contact),"lozano, andres m.; lyketsos, constantine g (co...",,johns hopkins university,baltimore,md,united states,212051832


In [227]:
multi_pi_ids = multi_pi_unsplit.copy()
del multi_pi_ids['pi_names']
del multi_pi_unsplit['pi_ids']
multi_pi_ids.sample()
multi_pi_unsplit.sample()

Unnamed: 0,pi_ids,contact,org_name,org_city,org_state,org_country,org_zipcode
54039,8803891 (contact); 7645499,,university of tx md anderson can ctr,houston,tx,united states,770304009


Unnamed: 0,pi_names,contact,org_name,org_city,org_state,org_country,org_zipcode
14939,"cotman, carl wayne (contact); tenner, andrea joan",,university of california-irvine,irvine,ca,united states,926173213


In [220]:
multi_pi_ids.shape
multi_pi_unsplit.shape

(9610, 7)

(9610, 7)

In [208]:
def split_rows_pis(df, col_name = 'pi_names'):
    '''
    
    Different PIs are delimited by a ';'.
    Split on ';'
    
    '''
    s = df[col_name].str.split(';').apply(pd.Series, 1).stack()
    s.index = s.index.droplevel(-1)
    s.name = col_name
    del df[col_name]
    df_new = df.join(s)
    return df_new

#multi_pi = split_rows_pis(multi_pi)
multi_pi = split_rows_pis(multi_pi_unsplit)
multi_pi_ids = split_rows_pis(multi_pi_ids, col_name = 'pi_ids')
multi_pi.head()
multi_pi_ids.head()

Unnamed: 0,contact,org_name,org_city,org_state,org_country,org_zipcode,pi_names
2,,university of michigan,ann arbor,mi,united states,481091276,"walton, maureen a"
2,,university of michigan,ann arbor,mi,united states,481091276,"zucker, robert alpert (contact)"
3,,harvard university,cambridge,ma,united states,21385369,"dulac, catherine g (contact)"
3,,harvard university,cambridge,ma,united states,21385369,"regev, aviv"
3,,harvard university,cambridge,ma,united states,21385369,"zhuang, xiaowei"


Unnamed: 0,contact,org_name,org_city,org_state,org_country,org_zipcode,pi_ids
2,,university of michigan,ann arbor,mi,united states,481091276,7354078
2,,university of michigan,ann arbor,mi,united states,481091276,1862210 (contact)
3,,harvard university,cambridge,ma,united states,21385369,2275890 (contact)
3,,harvard university,cambridge,ma,united states,21385369,8742217
3,,harvard university,cambridge,ma,united states,21385369,6139020


In [221]:
multi_pi.tail()
multi_pi_ids.tail()

Unnamed: 0,contact,org_name,org_city,org_state,org_country,org_zipcode,pi_names
71621,,emory university,atlanta,ga,united states,303224250,"smith, alicia k. (contact)"
71627,,new york university,new york,ny,united states,100122300,"pyle, anna marie"
71627,,new york university,new york,ny,united states,100122300,"schlick, tamar (contact)"
71821,,university of maryland baltimore,baltimore,md,united states,212011508,"kao, joseph pao yung"
71821,,university of maryland baltimore,baltimore,md,united states,212011508,"mayer, dirk (contact)"


Unnamed: 0,pi_ids,contact,org_name,org_city,org_state,org_country,org_zipcode
71539,1891094; 12301129 (contact),,johns hopkins university,baltimore,md,united states,212051832
71579,7356500 (contact); 2447996,,virginia commonwealth university,richmond,va,united states,232980568
71621,9140766; 8800131 (contact),,emory university,atlanta,ga,united states,303224250
71627,1885894; 2414701 (contact),,new york university,new york,ny,united states,100122300
71821,8080184; 9340781 (contact),,university of maryland baltimore,baltimore,md,united states,212011508


In [215]:
multi_pi.shape
multi_pi_ids.shape

(22336, 7)

(22329, 7)

In [213]:
#result = pd.concat([df1, s1], axis=1)
multi = pd.concat([multi_pi, multi_pi_ids['pi_ids']], axis = 1)
multi.head()

ValueError: Shape of passed values is (8, 93734), indices imply (8, 35055)

In [166]:
multi_stripped = strip_string(multi_pi, ' (contact)')
multi_stripped.head()

Unnamed: 0,pi_names,contact,org_name,org_city,org_state,org_country,org_zipcode,pi_ids
2,"walton, maureen a; zucker, robert alper",,university of michig,rbor,mi,united states,481091276,7354078
2,"walton, maureen a; zucker, robert alper",,university of michig,rbor,mi,united states,481091276,1862210
3,"dulac, catherine g (contact); regev, aviv ; zh...",,harvard university,mbridge,m,united states,21385369,2275890
3,"dulac, catherine g (contact); regev, aviv ; zh...",,harvard university,mbridge,m,united states,21385369,8742217
3,"dulac, catherine g (contact); regev, aviv ; zh...",,harvard university,mbridge,m,united states,21385369,6139020


In [159]:
multi_stripped['pi_ids'].isin(pi_info['pi_ids'])

2         True
2        False
3         True
3         True
3         True
4         True
4         True
6        False
6        False
6        False
6         True
6        False
6        False
6        False
12       False
12        True
18       False
18       False
18       False
19       False
19       False
33        True
33        True
33        True
33        True
33       False
35       False
35        True
36        True
36       False
         ...  
71149    False
71149     True
71177    False
71177     True
71245    False
71245    False
71246    False
71246    False
71272    False
71272    False
71374    False
71374    False
71384     True
71384     True
71425     True
71425    False
71465     True
71465    False
71512    False
71512     True
71539    False
71539    False
71579    False
71579    False
71621     True
71621    False
71627     True
71627    False
71821    False
71821     True
Name: pi_ids, dtype: bool

In [242]:
pi_info.shape
pi_unique = pi_info.drop_duplicates('pi_ids')
pi_unique.shape

(62262, 8)

(40104, 8)

In [243]:
multi_stripped.ix[multi_stripped['pi_ids'].isin(pi_unique['pi_ids']) == False]

Unnamed: 0,pi_names,contact,org_name,org_city,org_state,org_country,org_zipcode,pi_ids
2,"walton, maureen a; zucker, robert alper",,university of michig,rbor,mi,united states,481091276,1862210
6,"eden, uri tzvi; frank, loren m; ganguli, surya...",,ld spring harbor laboratory,ld spring harbor,y,united states,117242209,9851446
6,"eden, uri tzvi; frank, loren m; ganguli, surya...",,ld spring harbor laboratory,ld spring harbor,y,united states,117242209,2291297
6,"eden, uri tzvi; frank, loren m; ganguli, surya...",,ld spring harbor laboratory,ld spring harbor,y,united states,117242209,12280974
6,"eden, uri tzvi; frank, loren m; ganguli, surya...",,ld spring harbor laboratory,ld spring harbor,y,united states,117242209,10329759
6,"eden, uri tzvi; frank, loren m; ganguli, surya...",,ld spring harbor laboratory,ld spring harbor,y,united states,117242209,12572655
6,"eden, uri tzvi; frank, loren m; ganguli, surya...",,ld spring harbor laboratory,ld spring harbor,y,united states,117242209,11706552
12,"lozano, andres m.; lyketsos, constantine g",,johns hopkins university,baltimore,md,united states,212051832,10940848
18,"gefenas, eugenijus ; philpott, sean m. (contac...",,larkson university,potsdam,y,united states,136995630,9315706
18,"gefenas, eugenijus ; philpott, sean m. (contac...",,larkson university,potsdam,y,united states,136995630,10420866
