In [1]:
import igraph as ig
import pandas as pd
import numpy as np
# Show (practically) every column when a wide DataFrame is displayed.
pd.set_option("display.max_columns", 999)

### Combine publications

In [71]:
study_pub = pd.read_csv("../data/study_author_pub.csv", header=0)
comp_pub = pd.read_csv("../data/comp_author_pub.csv", header=0)


def _add_author_names(pub):
    """Add the two author-name columns used to match AUTHOR_LIST entries.

    'author_name'       -> "Last, First M"
    'author_name_no_mi' -> "Last, First"

    The columns are added to ``pub`` in place; ``pub`` is also returned.
    """
    base_name = pub['last_name'].str.capitalize() + ', ' + pub['first_name'].str.capitalize()
    # A missing middle initial is NaN, and NaN propagates through string
    # concatenation: without fillna the whole author_name would become NaN
    # for every author that has no middle initial.
    mi_suffix = (' ' + pub['middle_initial'].str.upper()).fillna('')
    pub['author_name'] = base_name + mi_suffix
    pub['author_name_no_mi'] = base_name
    return pub


# create author full names to expand author lists of publications
study_pub = _add_author_names(study_pub)
comp_pub = _add_author_names(comp_pub)

# One row per distinct full name, for each cohort.
name_cols = ['author_name', 'author_name_no_mi']
study_authors_df = study_pub[name_cols].drop_duplicates('author_name')
comp_authors_df = comp_pub[name_cols].drop_duplicates('author_name')

# Parallel numpy arrays: position k of *_authors and *_authors_nomi refer to
# the same author (full name / name without middle initial).
study_authors = study_authors_df.values[:, 0]
study_authors_nomi = study_authors_df.values[:, 1]
comp_authors = comp_authors_df.values[:, 0]
comp_authors_nomi = comp_authors_df.values[:, 1]

#combine all publications
all_pub = pd.concat([study_pub, comp_pub], axis=0)
# A publication shared by several tracked authors appears once per author;
# keep a single row per PMID.
all_pub = all_pub.drop_duplicates('PMID')
temp_df = all_pub[['AUTHOR_LIST', 'PMID']]

### Expand the author list column

In [72]:
#expand the author list to a long format
# Spellings that occur in AUTHOR_LIST but match neither the full name nor the
# name-without-middle-initial of a tracked author.
# raw spelling -> (group, canonical author name)
manual_author_fixes = {
    'Nelson, William James': ('study', 'Nelson, William J'),
    'Macmillan, David W C': ('study', 'Macmillan, David W'),
    'Pendergast, Ann Marie': ('comp', 'Pendergast, Ann M'),
    'Au, Jessie L-S': ('comp', 'Au, Jessie L'),
    'Peterlin, Boris Matija': ('comp', 'Peterlin, Boris M'),
    'Yan, Shi Du': ('comp', 'Yan, Shi D'),
}


def _first_full_name(full_names, short_names):
    """Map each name without middle initial to the first full name that has it."""
    lookup = {}
    for full, short in zip(full_names, short_names):
        # setdefault keeps the first occurrence, like `full[short == a][0]`.
        lookup.setdefault(short, str(full))
    return lookup


# Hash-based lookups instead of scanning the numpy arrays for every author.
study_full = set(study_authors)
study_short = _first_full_name(study_authors, study_authors_nomi)
comp_full = set(comp_authors)
comp_short = _first_full_name(comp_authors, comp_authors_nomi)


def _classify_author(name):
    """Return (group, canonical_name) for one raw AUTHOR_LIST entry.

    Precedence: study full name, study name without middle initial,
    comp full name, comp name without middle initial, manual fixes, 'others'.
    """
    if name in study_full:
        return 'study', name
    if name in study_short:
        return 'study', study_short[name]
    if name in comp_full:
        return 'comp', name
    if name in comp_short:
        return 'comp', comp_short[name]
    return manual_author_fixes.get(name, ('others', name))


records = []
for author_list, pub_id in zip(temp_df['AUTHOR_LIST'], temp_df['PMID']):
    # pandas represents missing cells as NaN (not None); skip such rows
    # instead of failing on `.strip()`.
    if pd.isna(author_list) or pd.isna(pub_id):
        continue
    for raw_name in author_list.strip().split(';'):
        raw_name = raw_name.strip()
        if raw_name == '':
            continue
        author_group, canonical_name = _classify_author(raw_name)
        records.append((canonical_name, pub_id, author_group))

df_net = pd.DataFrame(records, columns=['authors', 'pmid', 'group'])
# str(NaN) == 'nan': drop authors whose canonical full name was missing.
df_net = df_net[df_net.authors != 'nan']
df_net = df_net.drop_duplicates()
df_net.to_csv('../data/net_raw.csv', index=False, header=True)
df_net.shape

### Creating an edge list

In [28]:
# Reload the long (author, pmid, group) table from the location it was
# written to ('../data/net_raw.csv'), so the notebook runs top to bottom.
df = pd.read_csv('../data/net_raw.csv', header=0)
# Split the rows by author group.
df_other = df[df.group == 'others']
df_study = df[df.group == 'study']
df_comp = df[df.group == 'comp']

##### Filter other authors by the number of their publications

In [44]:
def filter_by_pub_count(df, numPub):
    """Keep authors of ``df`` with more than ``numPub`` distinct publications.

    The surviving rows are appended to the notebook-level ``df_study`` and
    ``df_comp`` frames (read as globals), and two CSV files are written under
    '../data/': the combined rows ('..._raw.csv') and the de-duplicated
    Id/Group node table ('..._nodes_gephi.csv').

    Parameters
    ----------
    df : DataFrame with 'authors', 'pmid' and 'group' columns.
    numPub : threshold; an author is kept when their number of unique
        'pmid' values is strictly greater than this.

    Returns
    -------
    DataFrame: ``df_study`` + ``df_comp`` + the filtered rows of ``df``.
    """
    def has_enough_pubs(author_rows):
        return len(author_rows['pmid'].unique()) > numPub

    prolific = df.groupby(['authors']).filter(has_enough_pubs)
    combined = pd.concat([df_study, df_comp, prolific], axis=0)

    out_prefix = '../data/author_has_pub_gt_' + str(numPub)
    combined.to_csv(out_prefix + '_raw.csv', header=True, index=False)

    # Node table for Gephi: one row per (author, group) pair.
    nodes = (
        combined[['authors', 'group']]
        .drop_duplicates()
        .rename(columns={'authors': 'Id', 'group': 'Group'})
    )
    nodes.to_csv(out_prefix + '_nodes_gephi.csv', header=True, index=False)
    return combined


In [62]:
# Study + comparison authors, plus every 'others' author with more than 10
# distinct publications.
df_author_other_has_pub_gt_10 = filter_by_pub_count(df=df_other, numPub=10)

##### combine study and comparison groups

In [111]:
# Rows of the study and comparison authors only (no 'others').
df_study_comb = pd.concat([df_study, df_comp], axis=0)
df_study_comb.to_csv('../data/study_comb_raw.csv', header=True, index=False)

# Gephi node table: one row per (author, group) pair.
df_node = (
    df_study_comb[['authors', 'group']]
    .drop_duplicates()
    .rename(columns={'authors': 'Id', 'group': 'Group'})
)
df_node.to_csv('../data/study_comb_nodes_gephi.csv', header=True, index=False)

##### study, comparison, and all groups

In [43]:
# the whole set of authors
# NOTE: this rebinds df_net (previously the full long table) to the
# (authors, pmid) columns of the reloaded frame.
df_net = df[['authors', 'pmid']]


def _gephi_nodes(frame):
    """Return the de-duplicated (author, group) pairs of ``frame`` as an Id/Group table."""
    return (
        frame[['authors', 'group']]
        .drop_duplicates()
        .rename(columns={'authors': 'Id', 'group': 'Group'})
    )


df_group = _gephi_nodes(df)
## study group
df_study_gr = _gephi_nodes(df_study)
## comp group
df_comp_gr = _gephi_nodes(df_comp)

# index=False keeps these files consistent with the other *_nodes_gephi.csv
# files written in this notebook; otherwise the pandas row index is written
# as an extra unnamed first column.
df_group.to_csv('../data/all_nodes_gephi.csv', header=True, index=False)
df_study_gr.to_csv('../data/study_nodes_gephi.csv', header=True, index=False)
df_comp_gr.to_csv('../data/comp_nodes_gephi.csv', header=True, index=False)


##### create edges

In [115]:
# PMID -> ACTIVITY table; createGraphCsv left-joins it onto the edge list.
activities = all_pub.loc[:, ['PMID', 'ACTIVITY']]
def getEdges(df):
    """Build the co-authorship edge list of ``df``.

    For every publication (distinct value of ``df.pmid``, in order of first
    appearance) one edge is emitted per unordered pair of its authors, in row
    order. A publication with a single author yields one self-loop
    (author, author). Authors are read from the FIRST column of ``df``.
    Rows whose pmid is missing (NaN) produce no edges.

    Parameters
    ----------
    df : DataFrame whose first column holds the author and which has a
        numeric 'pmid' column.

    Returns
    -------
    edges_list : object ndarray of shape (n_edges, 2) -- source, target.
    pmids : float64 ndarray of shape (n_edges,) -- publication id per edge.
    """
    from itertools import combinations

    sources, targets, pub_ids = [], [], []
    # One pass over the publications instead of re-scanning the whole frame
    # with a boolean mask per publication; sort=False keeps first-appearance
    # order of the publications.
    for idx, (p, pub_rows) in enumerate(df.groupby('pmid', sort=False)):
        if (idx % 1000) == 0:
            print(idx)  # coarse progress indicator
        pub_authors = pub_rows.iloc[:, 0].tolist()
        if len(pub_authors) == 1:
            pairs = [(pub_authors[0], pub_authors[0])]
        else:
            # combinations() yields (i, j) with i < j in row order.
            pairs = combinations(pub_authors, 2)
        for src, dst in pairs:
            sources.append(src)
            targets.append(dst)
            pub_ids.append(p)

    # Materialise the arrays once rather than growing them inside the loop.
    edges_list = np.empty((len(sources), 2), 'object')
    edges_list[:, 0] = sources
    edges_list[:, 1] = targets
    pmids = np.array(pub_ids, dtype=np.float64)
    return edges_list, pmids

def createGraphCsv(edge_list, pmids, file_name):
    """Turn an edge list into a Gephi edge table, save it and return it.

    Parameters
    ----------
    edge_list : (n, 2) array of source/target authors.
    pmids : length-n array with the publication id of each edge.
    file_name : base name of the CSV written to '../data/<file_name>.csv'.

    The table gets the columns Source, Target, Label (the publication id) and
    Type ('Undirected'), is left-joined on Label == PMID with the PUB_YEAR /
    COUNTRY columns of the notebook-level ``all_pub`` and then with the
    notebook-level ``activities`` table, and finally loses its self-loops
    (Source == Target).
    """
    endpoints = pd.DataFrame(edge_list, columns=['Source', 'Targe'])
    pub_ids = pd.DataFrame(pmids, columns=['pmids'])
    graph = pd.concat([endpoints, pub_ids], axis=1)
    graph.columns = ['Source', 'Target', 'Label']
    ### undirect graph
    graph['Type'] = 'Undirected'

    # Attach publication attributes, then activities, both keyed by PMID.
    for attrs in (all_pub[['PMID', 'PUB_YEAR', 'COUNTRY']], activities):
        graph = graph.merge(attrs, left_on='Label', right_on='PMID', how='left')

    # Self-loops come from single-author publications; drop them.
    graph = graph[graph.Source != graph.Target]
    graph.to_csv("../data/" + file_name + '.csv', index=False, header=True)
    return graph

def createEdges(df, fileName):
    """Compute the co-authorship edges of ``df`` and write them to '../data/<fileName>.csv'.

    Chains getEdges and createGraphCsv; returns the edge DataFrame produced
    by createGraphCsv.
    """
    edge_pairs, edge_pmids = getEdges(df)
    return createGraphCsv(edge_pairs, edge_pmids, fileName)

In [None]:
# Edges among study and comparison authors only.
df_comp_edges = createEdges(df=df_study_comb, fileName='study_comp_edges')

In [None]:
# Edges for study + comparison authors plus the 'others' authors with more
# than 10 publications. The input must be the filtered frame returned by
# filter_by_pub_count, not df_study_comb (which has no 'others' rows and
# would just reproduce the study/comp edge file).
df_author_other_has_pub_gt_10_edges = createEdges(df_author_other_has_pub_gt_10, 'author_other_has_pub_gt_10_edges')