In [37]:
import igraph as ig

In [38]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = 999

In [39]:
study_pub = pd.read_csv("../data/study_author_pub.csv", header=0)
comp_pub = pd.read_csv("../data/comp_author_pub.csv", header=0)

In [40]:
# create author full names to expand author lists of publications
study_pub['author_name'] = study_pub['last_name'].str.capitalize() + ', ' + study_pub['first_name'].str.capitalize() + ' ' +  study_pub['middle_initial'].str.upper()
comp_pub['author_name'] = comp_pub['last_name'].str.capitalize() + ', ' + comp_pub['first_name'].str.capitalize() + ' ' +  comp_pub['middle_initial'].str.upper()
study_pub['author_name_no_mi'] = study_pub['last_name'].str.capitalize() + ', ' + study_pub['first_name'].str.capitalize() 
comp_pub['author_name_no_mi'] = comp_pub['last_name'].str.capitalize() + ', ' + comp_pub['first_name'].str.capitalize() 
study_authors_df = study_pub[['author_name','author_name_no_mi']]
comp_authors_df = comp_pub[['author_name','author_name_no_mi']]
study_authors_df = study_authors_df.drop_duplicates('author_name')
comp_authors_df = comp_authors_df.drop_duplicates('author_name')
study_authors = study_authors_df.values[:,0]
study_authors_nomi = study_authors_df.values[:,1]
comp_authors = comp_authors_df.values[:,0]
comp_authors_nomi = comp_authors_df.values[:,1]
#combine all publications
all_pub = pd.concat([study_pub, comp_pub], axis=0)
all_pub = all_pub.drop_duplicates('PMID')
temp_df = all_pub[['AUTHOR_LIST','PMID']]
temp_df = all_pub[['AUTHOR_LIST','PMID']]

In [None]:
#expand the author list to a long format
vals = temp_df.values
numRow, numCol = vals.shape
authors = []
pmid = []
group = []
for i in range(numRow):
    if vals[i, 0] is not None and vals[i, 1] is not None:
        al = vals[i, 0].strip().split(';')
        al = [a.strip() for a in al ]
        al = [a for a in al if  a != '']
        for a in al:
            if a in study_authors:
                group.append('study')
                authors.append(a)
            elif a in study_authors_nomi:
                group.append('study')
                authors.append(str(study_authors[study_authors_nomi==a][0]))
            elif a in comp_authors:
                group.append('comp')
                authors.append(a)
            elif a in comp_authors_nomi:
                group.append('comp')
                authors.append(str(comp_authors[comp_authors_nomi==a][0]))
            elif a == 'Nelson, William James':
                group.append('study')
                authors.append('Nelson, William J')
            elif a == 'Macmillan, David W C':
                group.append('study')
                authors.append('Macmillan, David W')
            elif a == 'Pendergast, Ann Marie':
                group.append('comp')
                authors.append('Pendergast, Ann M')
            elif a == 'Au, Jessie L-S':
                group.append('comp')
                authors.append('Au, Jessie L') 
            elif a == 'Peterlin, Boris Matija':
                group.append('comp')
                authors.append('Peterlin, Boris M') 
            elif a == 'Yan, Shi Du':
                group.append('comp')
                authors.append('Yan, Shi D')     
            else:
                group.append('others')
                authors.append(a)
            pmid.append(vals[i, 1])
df_net = pd.DataFrame({'authors': authors, 'pmid': pmid, 'group': group})
df_net = df_net[df_net.authors != 'nan']
df_net.to_csv('../data/net_raw.csv', index=False, header=True)


In [43]:
df_net.shape

(382676, 3)

### Creating a edge list

In [5]:
# the whole set of authors
df = pd.read_csv('net_raw.csv', header=0)
df = df.drop_duplicates()
df.to_csv('../data/net_raw.csv', index=False, header=True)
df_net = df[['authors', 'pmid']]
df_group = df[['authors', 'group']]
df_group = df_group.drop_duplicates()
df_group.columns = ['Id', 'Group']
## study group
df_study = df[df.group=='study']
df_study_gr = df_study[['authors', 'group']]
df_study_gr = df_study_gr.drop_duplicates()
df_study_gr.columns = ['Id', 'Group']
## comp group
df_comp = df[df.group=='comp']
df_comp_gr = df_comp[['authors', 'group']]
df_comp_gr = df_comp_gr.drop_duplicates()
df_comp_gr.columns = ['Id', 'Group']
df_group.to_csv('../data/all_nodes_gephi.csv', header=True)
df_study_gr.to_csv('../data/study_nodes_gephi.csv', header=True)
df_comp_gr.to_csv('../data/comp_nodes_gephi.csv', header=True)



In [43]:
#unique publications
#looping over publications to create edge list
def getEdges(df):
    uniq_pub = df.pmid.unique()
    edges_list = np.empty((0,2), 'object')
    pmids = np.empty(0, np.float64)
    for idx, p in enumerate(uniq_pub):
        if (idx % 1000) == 0:
            print(idx)
        df_p = df[df.pmid == p]
        a4p = df_p.values[:,0]
        len_a = len(a4p)
        for i in range(len_a):
            if len_a == 1:
                edges[:,0] = a4p[i]
                edges[:,1] = a4p[i]
                edges_list = np.r_[edges_list, edges]
                num_row = 1
                e_ids = np.ones(num_row) * p
                pmids = np.r_[pmids, e_ids]
            else:
                if i == (len_a - 1):
                    pass
                else:
                    num_row = len_a - i - 1
                    edges = np.empty((num_row, 2), 'object')
                    edges[:,0] = a4p[i]
                    edges[:,1] = a4p[i+1::]
                    e_ids = np.ones(num_row) * p
                    edges_list = np.r_[edges_list, edges]
                    pmids = np.r_[pmids, e_ids]
    return edges_list, pmids

In [None]:
study_edges, study_pmid = getEdges(df_study)
comp_edges, comp_pmid = getEdges(df_comp)

In [None]:
#full network take 3-4 hours to run
all_edges, all_pmid = getEdges(df_net)

In [48]:
study_edges

array([['Kellogg, Douglas R', 'Sullivan, William T'],
       ['Stivers, James T', 'Cole, Philip A'],
       ['Stivers, James T', 'Greenberg, Marc M'],
       ..., 
       ['Glick, Benjamin S', 'Rothman, James E'],
       ['Linstedt, Adam D', 'Rothman, James E'],
       ['Cane, David E', 'Puglisi, Joseph D']], dtype=object)

In [45]:
def createGraphCsv(edge_list, pmids, file_name):
    df_edges_raws = pd.DataFrame(edge_list, columns=['Source', 'Targe'])
    df_edges_pmids = pd.DataFrame(pmids, columns=['pmids'])
    df_gephi = pd.concat([df_edges_raws, df_edges_pmids], axis=1);
    df_gephi.columns = ['Source', 'Target', 'Label']
    ### undirect graph 
    df_gephi['Type'] = 'Undirected'
    pub_attr = all_pub[['PMID', 'PUB_YEAR','COUNTRY']]
    df_gephi = df_gephi.merge(pub_attr, left_on='Label', right_on='PMID', how='left')
    df_gephi.drop(['PMID'], axis=1, inplace=True)
    df_gephi.to_csv("../data/" + file_name + '.csv', index=False, header=True)

In [46]:
createGraphCsv(study_edges, study_pmid, 'study_edges')
createGraphCsv(comp_edges, comp_pmid, 'comp_edges')
createGraphCsv(all_edges, all_pmid, 'all_edges')

In [26]:
#add Funding Activity R01...
study_edges = pd.read_csv('../data/study_edges.csv', header=0)
activities = all_pub[['PMID','ACTIVITY']]
study_edges = study_edges.merge(activities, left_on='Label', right_on='PMID', how='left')
study_edges.to_csv('../data/study_edges.csv', index=False, header=True)
comp_edges = pd.read_csv('../data/comp_edges.csv', header=0)
comp_edges = comp_edges.merge(activities, left_on='Label', right_on='PMID', how='left')
comp_edges.to_csv('../data/comp_edges.csv', index=False, header=True)
all_edges = pd.read_csv('../data/all_edges.csv', header=0)
all_edges = all_edges.merge(activities, left_on='Label', right_on='PMID', how='left')
all_edges.to_csv('../data/all_edges.csv', index=False, header=True)

In [50]:
a = all_pub[all_pub['PMID'] == 15057822.0]['AUTHOR_LIST'].iloc[0]

In [53]:
al = a.split(';')
print(len(al))
from scipy.misc import comb
from scipy.special import perm
comb(231,2)

### Using both publication and author as nodes

In [74]:
df_net.columns = ['Source', 'Target']
df_net['Type'] = 'Undirected'
df_net.to_csv('../data/edge_pubnode_autnode.csv', index=False, header=True)
df_pub = pd.DataFrame({'Id': df_net.Target.unique()})
df_pub['Group'] = 'pub'
df_nodes = pd.concat([df_group, df_pub], axis=0)
df_nodes.to_csv('../data/node_pubnode_autnode.csv', index=False, header=True)

### Calculate Statstics among author groups

In [1]:
all_nodes = pd.read_csv('../data/nodes_gephi.csv', header=0)

NameError: name 'pd' is not defined

In [23]:
all_nodes = all_nodes.drop('Unnamed: 0', axis=1)

In [24]:
all_nodes[all_nodes.Group == 'study'].shape

(400, 2)

In [25]:
all_nodes.Group.unique()

array(['others', 'study', 'comp'], dtype=object)

In [26]:
all_nodes.shape

(139651, 2)

In [None]:
g = Graph(len(all_nodes))
g.vs['name'] = all_nodes.Id
g.vs['group'] = all_nodes.Group
edges = pd.read_csv('../data/edges_gephi.csv', header=0)
g.add_edges(edges[['Source', 'Target']].values.tolist())
g.es['pmid'] = edges['Label']
g.es['PUB_YEAR'] = edges['PUB_YEAR']
g.es['COUNTRY'] = edges['COUNTRY']

In [None]:
g.vs["bs"] = g.betweenness()

In [45]:
#average degree 
print('The average degree of the whole network: {0:0.2f}\n'.format(mean(g.degree())))
#study group degree
study_nodes = g.vs.select(lambda v : v['group'] == 'study')
print('The average degree of the study nodes: {0:0.2f}\n'.format(mean(study_nodes.degree())))
comp_nodes = g.vs.select(lambda v : v['group'] == 'comp')
print('The average degree of the comparison nodes: {0:0.2f}\n'.format(mean(comp_nodes.degree())))
other_nodes = g.vs.select(lambda v : v['group'] == 'others')
print('The average degree of the other nodes: {0:0.2f}\n'.format(mean(other_nodes.degree())))


The average degree of the whole network: 37.73

The average degree of the study nodes: 310.85

The average degree of the comparison nodes: 530.35

The average degree of the other nodes: 35.74



In [43]:
#nodes with degree 0
nodes_no_edges = g.vs.select(_degree=0)
len(nodes_no_edges)

7

In [49]:
#average degree 
print('The average betweenness of the whole network: {0:0.2f}\n'.format(mean(g.vs['bs'])))
#study group degree
study_nodes = g.vs.select(lambda v : v['group'] == 'study')
print('The average betweenness of the study nodes: {0:0.2f}\n'.format(mean(study_nodes['bs'])))
comp_nodes = g.vs.select(lambda v : v['group'] == 'comp')
print('The average betweenness of the comparison nodes: {0:0.2f}\n'.format(mean(comp_nodes['bs'])))
other_nodes = g.vs.select(lambda v : v['group'] == 'others')
print('The average betweenness of the other nodes: {0:0.2f}\n'.format(mean(other_nodes['bs'])))



The average betweenness of the whole network: 232397.24

The average betweenness of the study nodes: 12948939.37

The average betweenness of the comparison nodes: 21887753.12

The average betweenness of the other nodes: 143088.63



In [50]:
g.vs['closeness'] = g.closeness()

In [51]:
#average degree 
print('The average closeness of the whole network: {0:0.2f}\n'.format(mean(g.vs['closeness'])))
#study group degree
study_nodes = g.vs.select(lambda v : v['group'] == 'study')
print('The average closeness of the study nodes: {0:0.2f}\n'.format(mean(study_nodes['closeness'])))
comp_nodes = g.vs.select(lambda v : v['group'] == 'comp')
print('The average closeness of the comparison nodes: {0:0.2f}\n'.format(mean(comp_nodes['closeness'])))
other_nodes = g.vs.select(lambda v : v['group'] == 'others')
print('The average closeness of the other nodes: {0:0.2f}\n'.format(mean(other_nodes['closeness'])))




The average closeness of the whole network: 0.01

The average closeness of the study nodes: 0.01

The average closeness of the comparison nodes: 0.01

The average closeness of the other nodes: 0.01



In [28]:
g.write_pickle('all_author_graph.pkl')

In [30]:
import pickle as pk

In [32]:

g = pk.load('all_author_graph.pkl')

TypeError: file must have 'read' and 'readline' attributes

In [5]:
!ls

Untitled.ipynb
all_author_graph
data_exploration_only_fiscal_year_filter_by_cost.ipynb
data_preparation.ipynb
edges_gephi.csv
edges_pmids.csv
edges_raw.csv
net_raw.csv
network_analysis.ipynb
nih_analyses.db
sample_questions_v5.ipynb


In [None]:
am = g.get_adjacency()

In [2]:
import numpy as np