# DCSNS Bibliographic Networks

In [1]:
from IPython.display import clear_output
!pip install crossrefapi
clear_output()

import csv, re, random, operator, os, math, re, string, copy, itertools, pickle, datetime, pandas as pd, numpy as np, matplotlib.pyplot as plt, networkx as nx
from urllib.request import urlopen
from zipfile import ZipFile
from io import BytesIO
from collections import Counter, OrderedDict
from itertools import combinations, permutations
import operator
from wordcloud import WordCloud
from networkx.algorithms import community
import pygraphviz
from networkx.drawing.nx_agraph import graphviz_layout
from networkx.algorithms import community
import community as louvain
import spacy 
nlp = spacy.load('en_core_web_lg')

# crossrefapi
from crossref.restful import Journals
journals = Journals()
from crossref.restful import Works
works = Works()

# Plotting-related
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
# from matplotlib import pyplot as plt
# from matplotlib.gridspec import GridSpec
from matplotlib.ticker import FuncFormatter
import matplotlib.colors as mcolors
import matplotlib._color_data as mcd
%matplotlib inline
import seaborn as sns
import plotly
from plotly import tools
# import plotly.plotly as py
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
#connects JS to notebook so plots work inline
init_notebook_mode(connected=True)

import bokeh
from bokeh.io import push_notebook, show, output_notebook, save
import bokeh.plotting as bp
from bokeh.plotting import figure, save, output_file, show #, from_networkx
from bokeh.models import (ColumnDataSource, LabelSet, Label, BoxSelectTool, Circle, EdgesAndLinkedNodes, HoverTool,MultiLine, NodesAndLinkedEdges, Plot, Range1d, TapTool,)
from holoviews.element.graphs import layout_nodes
# bokeh.sampledata.download()
from bokeh.sampledata.airport_routes import routes, airports

output_notebook()
import holoviews as hv
from holoviews import dim, opts
hv.extension('bokeh', 'matplotlib')
from holoviews.operation import  gridmatrix
from holoviews.operation.datashader import datashade, bundle_graph
from holoviews import Graph, Nodes
from holoviews.plotting.bokeh import GraphPlot, LabelsPlot
import hvplot.networkx as hvnx
import hvplot.pandas

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning) 
warnings.simplefilter('ignore')

In [2]:
def g_diagnostics(G,st):
    print("The %s has %i nodes and %i edges" %(st,len(G.nodes()), len(G.edges())),'\n')
    if G.is_directed()==True:
        print("The %s is a directed graph" %st)
    else:
        print("The %s is an undirected graph" %st) 
    if G.is_multigraph()==True:
        print("The %s is a multigraph" %st)
    else:
        print("The %s is a simple graph" %st)
    if nx.is_weighted(G)==True:
        print("The %s is a weighted graph" %st)
    else:
        print("The %s is an unweighted graph" %st) 
    if nx.is_bipartite(G)==True:
        print("The %s is a bipartite graph" %st)
    else:
        print("The %s is not a bipartite graph" %st)    
    if nx.is_tree(G)==True:
        print("The %s is a tree \n" %st)
    else:
        print("The %s is not a tree \n" %st )   
    if G.is_directed()==False:
        if nx.is_connected(G)==True:
            print("The %s is a connected graph" %st)
        else:
            print("The %s is a disconnected graph and it has %i connected components" %(st,nx.number_connected_components(G)))   
            giant = max(nx.connected_components(G), key=len)
            Glcc=G.subgraph(giant)
            print("The largest connected component of this graph has %i nodes and %i edges" %(len(Glcc.nodes()),len(Glcc.edges())))
    else:
        if nx.is_strongly_connected(G)==True:
            print("The %s is a strongly connected graph" %st)
        else:
            print("The %s is not strongly connected and it has %i strongly connected components" %(st,nx.number_strongly_connected_components(G)))
            giant = max(nx.strongly_connected_components(G), key=len)
            Glcc=G.subgraph(giant)
            print("The largest strongly connected component of this graph has %i nodes and %i edges" %(len(Glcc.nodes()),len(Glcc.edges())))
        if nx.is_weakly_connected(G)==True:
            print("The %s is a weakly connected graph" %st)
        else:
            print("The %s is not weakly connected and it has %i weakly connected components" %(st,nx.number_weakly_connected_components(G)))
            giant = max(nx.weakly_connected_components(G), key=len)
            Glwc=G.subgraph(giant)
            print("The largest weakly connected component of this graph has %i nodes and %i edges" %(len(Glwc.nodes()),len(Glwc.edges())))
    if len(list(nx.isolates(G)))>0:
        print("The %s has %i isolates \n" %(st,len(list(nx.isolates(G)))))
    else:
        print("The %s has no isolates \n" %st)
    print("The density of the %s is %.3f" %(st,nx.density(G)))  
    try:
        print("The transitivity of the %s is %.3f" %(st,nx.transitivity(G)) )
    except Exception as e: 
        print("Transitivity not computable: "+str(e))
    if G.is_directed()==True:
        print("The reciprocity of the %s is %.3f" %(st,nx.reciprocity(G)))
    if G.is_directed()==False:
        try:
            print("The average clustering of the %s is %.3f" %(st,nx.algorithms.cluster.average_clustering(G)))
        except Exception as e: 
            print("Average clustering not computable: "+str(e))
        if nx.is_connected(G)==True:
            print("The average shortest path length of the %s is %.3f" %(st,nx.algorithms.shortest_paths.generic.average_shortest_path_length(G)))  
    try:
        diameter=nx.diameter(G)
        print("The diameter of the %s is %i" %(st,diameter))
    except Exception as e: 
        print("Diameter not computable: "+str(e))

In [3]:
def hvnx_plot(G,ctype,pos,width,height,node_size,node_cmap,edge_color,edge_line_width,
              title,bundled,nodelabels,xoffset,yoffset,
              arrowhead_length,selection_mode,selection_policy,
              edge_hover_line_color,node_hover_fill_color,
              fontsize,text_font_size, text_color,bgcolor):
    if nx.is_directed(G)==True:
        in_neighbors={} #arriving_airlines={}
        for n in G.nodes():
            t=list(G.predecessors(n))
            in_neighbors[n]=", ".join(sorted([x for x in t]))
#         print(in_neighbors)
        out_neighbors={} #departing_airlines={}
        for n in G.nodes():
            t=list(G.successors(n))
            out_neighbors[n]=", ".join(sorted([x for x in t])) 
#         print(out_neighbors)
        if ctype==1:
            communities_generator = community.girvan_newman(G)
            top_level_communities = next(communities_generator)
            next_level_communities = next(communities_generator)
            lc=sorted(sorted(map(sorted, next_level_communities)), key=len,reverse=True)
            partition={n:i for i,c in enumerate(lc) for n in c }
            print("No. of Girvan-Newman communities",len(set(partition.values())))
            for n in G.nodes():
                G.nodes[n]['Girvan_Newman_community'] = partition[n]
#                 if (n,n) in G.edges():
#                     G.nodes[n]['in-degree'] = 0
#                     G.nodes[n]['in-neighbors'] = ""
#                     G.nodes[n]['out-degree'] = 0
#                     G.nodes[n]['out-neighbors'] = ""
#                 else:    
                G.nodes[n]['in-degree'] = G.in_degree(n)
                G.nodes[n]['in_neighbors'] = in_neighbors[n] 
#                     G.nodes[n]['arriving airlines'] = arriving_airlines[n] 
                G.nodes[n]['out-degree'] = G.out_degree(n)
                G.nodes[n]['out_neighbors'] = out_neighbors[n] 
#                     G.nodes[n]['departing airlines'] = departing_airlines[n] 
            graph = hvnx.draw(G, pos)
            graph.opts(edge_color=edge_color,edge_line_width=edge_line_width,node_size=node_size,node_color='Girvan_Newman_community',node_cmap=node_cmap)
            if bundled==0:                
                graph.opts(selection_policy=selection_policy,title=title,edge_hover_line_color=edge_hover_line_color,node_hover_fill_color=node_hover_fill_color,fontsize=fontsize,width=width,height=height,arrowhead_length=arrowhead_length) #,tools=tools) #,'box_zoom',"tap"])
                if nodelabels==1:
                    labels = hv.Labels(graph.nodes, ['x', 'y'], 'index')
                    graph=(graph * labels.opts(xoffset=xoffset, yoffset=yoffset,text_font_size=text_font_size, text_color=text_color, bgcolor=bgcolor))
                    print(labels)
                    return graph
                else:
                    return graph
            else:
                graph = bundle_graph(graph)
                graph.opts(selection_policy=selection_policy,title=title,edge_hover_line_color=edge_hover_line_color,node_hover_fill_color=node_hover_fill_color,fontsize=fontsize,width=width,height=height,arrowhead_length=arrowhead_length) #,tools=tools) #,'box_zoom',"tap"])
                if nodelabels==1:
                    labels = hv.Labels(graph.nodes, ['x', 'y'], 'index')
                    graph=(graph * labels.opts(xoffset=xoffset, yoffset=yoffset,text_font_size=text_font_size, text_color=text_color, bgcolor=bgcolor))
                    return graph
                else:
                    return graph
        if ctype==0:
            print("Louvain communities are not computable for directed graphs!")
    else:
        connections={}
        for n in G.nodes():
            t=list(G.neighbors(n))
            connections[n]=", ".join(sorted([x for x in t]))
#         connected_components={}
#         Gcc = sorted(connected_component_subgraphs(G), key = len, reverse=True)
#         ccl=[list(g.nodes) for g in Gcc]
#     #     ccl=sorted(ccl, key=lambda item: len(item[1]), reverse=True)
#         for i,j in enumerate(ccl):
#             for n in j:
#                 connected_components[n]=i 
        if ctype==1:
            communities_generator = community.girvan_newman(G)
            top_level_communities = next(communities_generator)
            next_level_communities = next(communities_generator)
            lc=sorted(sorted(map(sorted, next_level_communities)), key=len,reverse=True)
            partition={n:i for i,c in enumerate(lc) for n in c }
#             print("No. of connected components",len(ccl))
            print("No. of Girvan-Newman communities",len(set(partition.values())))
            for n in G.nodes():
                G.nodes[n]['Girvan-Newman_community'] = partition[n]
#                 if (n,n) in G.edges():
#                     G.nodes[n]['degree'] = 0
#                     G.nodes[n]['connections'] = ""
# #                     G.nodes[n]['connected_component'] = connected_components[n]
#                 else:    
                G.nodes[n]['degree'] = G.degree(n)
                G.nodes[n]['connections'] = connections[n] #", ".join([str(x) for x in list(G.neighbors(n))])
#                     G.nodes[n]['connected_component'] = connected_components[n]
    #         for n in G.nodes():
    #             G.nodes[n]['Louvain_community'] = partition[n]
            graph = hvnx.draw(G, pos)
            graph.opts(edge_color=edge_color,edge_line_width=edge_line_width,node_size=node_size,node_color='Girvan-Newman_community',node_cmap=node_cmap)
            graph.opts(padding=0.15)
            if bundled==0:
                graph.opts(selection_policy=selection_policy,title=title,edge_hover_line_color=edge_hover_line_color,node_hover_fill_color=node_hover_fill_color,fontsize=fontsize,width=width,height=height,arrowhead_length=arrowhead_length) #,tools=tools) #,'box_zoom',"tap"])
                if nodelabels==1:
                    labels = hv.Labels(graph.nodes, ['x', 'y'], 'index')
                    graph=(graph * labels.opts(xoffset=xoffset, yoffset=yoffset,text_font_size=text_font_size, text_color=text_color, bgcolor=bgcolor))            
                    return graph
                else:
                    return graph
            else:
                graph = bundle_graph(graph)
                graph.opts(selection_policy=selection_policy,title=title,edge_hover_line_color=edge_hover_line_color,node_hover_fill_color=node_hover_fill_color,fontsize=fontsize,width=width,height=height,arrowhead_length=arrowhead_length) #,tools=tools) #,'box_zoom',"tap"])
                if nodelabels==1:
                    labels = hv.Labels(graph.nodes, ['x', 'y'], 'index')
                    graph=(graph * labels.opts(xoffset=xoffset, yoffset=yoffset,text_font_size=text_font_size, text_color=text_color, bgcolor=bgcolor))
                    return graph
                else:
                    return graph
        if ctype==0:
            partition_l=louvain.best_partition(G)
#             print("No. of connected components",len(ccl))
            print("No. of Louvain communities",len(set(partition_l.values())))
            for n in G.nodes():
                G.nodes[n]['Louvain_community'] = partition_l[n]
                if (n,n) in G.edges():
                    G.nodes[n]['degree'] = 0
                    G.nodes[n]['connections'] = ""
        #             G.nodes[n]['connected_component'] = connected_components[n]
                else:    
                    G.nodes[n]['degree'] = G.degree(n)
                    G.nodes[n]['connections'] = connections[n] #", ".join([str(x) for x in list(G.neighbors(n))])  #connections          
        #             G.nodes[n]['connected_component'] = connected_components[n]
            graph = hvnx.draw(G, pos)
            graph.opts(edge_color=edge_color,edge_line_width=edge_line_width,node_size=node_size,node_color='Louvain_community',node_cmap=node_cmap)
            graph.opts(padding=0.15)
            if bundled==0:                
                graph.opts(selection_policy=selection_policy,title=title,edge_hover_line_color=edge_hover_line_color,node_hover_fill_color=node_hover_fill_color,fontsize=fontsize,width=width,height=height,arrowhead_length=arrowhead_length) #,tools=tools) #,'box_zoom',"tap"])
                if nodelabels==1:
                    labels = hv.Labels(graph.nodes, ['x', 'y'], 'index')
                    graph=(graph * labels.opts(xoffset=xoffset, yoffset=yoffset,text_font_size=text_font_size, text_color=text_color, bgcolor=bgcolor))
                    return graph
                else:
                    return graph
            else:
                graph = bundle_graph(graph)
                graph.opts(selection_policy=selection_policy,title=title,edge_hover_line_color=edge_hover_line_color,node_hover_fill_color=node_hover_fill_color,fontsize=fontsize,width=width,height=height,arrowhead_length=arrowhead_length) #,tools=tools) #,'box_zoom',"tap"])
                if nodelabels==1:
                    labels = hv.Labels(graph.nodes, ['x', 'y'], 'index')
                    graph=(graph * labels.opts(xoffset=xoffset, yoffset=yoffset,text_font_size=text_font_size, text_color=text_color, bgcolor=bgcolor))
                    return graph
                else:
                    return graph

In [4]:
node_list = [] # a list of titles and references
edge_list = [] # includes rows of format [a, b] where 'a' references 'b'
type_dict = {} # key: node, value: type ('title' or 'reference'), holds all possible node values

''' 
Requires: 'n_type' is either 'title' or 'reference' 
Modifies: If 'node' occurs in the list, preserves type 'title,' changing either 
          the 'node_list' value and the 'type_dict' type, or just the 'node' value.
          Else, adds 'node' to 'node_list.'
Effects:  Compares 'node' to the current 'note_list.' 
'''
def comp_add(node_list, node, n_type):
    for i in range(len(node_list)): 
        # check to see if 'node' compares to any current nodes
        if node in node_list[i] or node_list[i] in node: 
            # if a node exists as a row 'title' and a row 'reference', 
            # we want to favor the type 'title' in our data structures 
            if n_type == 'title': 
                # switch the representation in 'node_list' to 'title'
                node_list[i] = node
                type_dict[node] = n_type 
            else:
                # switch the representation of 'node' to 'title' 
                node = node_list[i]
            return node 
        
    # The rest of this function executes if 'node' is not already in 'node_list'
    if n_type == 'title':
        node_list.append(node)
        type_dict[node] = n_type
    else: 
        node_list.append(node)
        type_dict[node] = n_type

    return node

## 1. The Citation Network

In [5]:
def clean_auth(auth: list):
    if ',' in auth:
        auth = auth.split(',')[0].strip()
    if '.' in auth:
        auth = re.split('[A-Z]\.',auth)[0].strip()
    return auth

def shorten_auths(auths: list):
    if auths[0] == '[No author name available]':
        return 'Unknown'
    if '.' in auths:
        auths = auths.split('.,')
    else:
        auths = auths.strip(', ').split(',')
    
        
    if len(auths) == 1:
        return clean_auth(auths[0])
    elif len(auths) == 2:
        return clean_auth(auths[0]) + ' & ' + clean_auth(auths[1])
    else:
        return clean_auth(auths[0]) + ' et al.'

In [6]:
import re
res2 = re.compile('''
        (?P<ref_authors>([A-Z][a-z]+, ([A-Z]\.)+, )+).*(?=\()\(
        (?P<ref_year>\d{4})\) 
        (?P<ref_title>.*)
        (?=pp\. )pp\. (?P<pp>\d{1,5}-\d{1,5})(\. ?, )?
        (?P<pub>.+)
        ''', re.VERBOSE)

def ref_res(ref): 
    res = re.search(r'(?P<ref_authors>.*)(, )\((?P<ref_year>\d{4})\)(?P<ref_title>.*)', ref)
    if res and 'ref_authors' in res.groupdict() and 'ref_year' in res.groupdict() and 'ref_title' in res.groupdict(): 
        return res.groupdict()
    
    res = re.search(r'(?P<ref_authors>.*)(?P<ref_title>(?<=,).*)\((?P<ref_year>\d{4})\)', ref)
    if res and 'ref_authors' in res.groupdict() and 'ref_year' in res.groupdict() and 'ref_title' in res.groupdict(): 
        return res.groupdict()
    
    res = re.search(res2, ref)
    if res and 'ref_authors' in res.groupdict() and 'ref_year' in res.groupdict() and 'ref_title' in res.groupdict(): 
        return res.groupdict()
    
    else:
        return {'ref_authors':'unparseable', 'ref_year':'unparseable', 'ref_title':'unparseable'}

In [7]:
def source_check(title):
    for source in df.Title:
        if source in title or title in source:
            return source
        else:
            return None

In [None]:
datafile = 'SCOPUS Disease Control through Social Network Surveillance.csv' 
name = "Disease Control through Social Network Surveillance"
st = "Scopus bibliometric dataset on %s" %name

df = pd.read_csv(datafile)

In [20]:
df.DOI

0           10.1093/infdis/148.4.754
1       10.1016/0277-9536(84)90312-5
2       10.1016/0167-5877(85)90010-8
3      10.1016/s0749-3797(18)31265-0
4                                NaN
                   ...              
934           10.3390/ijerph17207434
935                 10.7326/M20-1509
936     10.1371/journal.pcbi.1008180
937              10.1093/cid/ciz1182
938       10.1016/j.jtbi.2020.110461
Name: DOI, Length: 939, dtype: object

In [23]:
# !conda install pymongo -y
from pymongo import MongoClient
import web_of_science_aux as wosa
papers_con = wosa.MongoConnection(wosa.merged_papers_settings)

In [24]:
result_query = papers_con.collection.find_one({"RS":'DOI 10.1016/0167-5877(85)90010-8'})
result_query

AutoReconnect: chicago.chem-eng.northwestern.edu:27017: [Errno 104] Connection reset by peer

In [None]:
result_query.keys()

In [9]:
from crossref.restful import Works
works = Works()

In [35]:
import traceback

nodes = pd.DataFrame()
edges = pd.DataFrame()
ref2source = 0
for index, row in df.iterrows():
    if index % 100 == 0:
        print('\n\n\n\nPROCESSING index: ', index, '\n\n\n\n')
    out = {}
    if pd.isnull(row.DOI):
        out['DOI'] = 'noDOI' + str(index)
    else:
        out['DOI'] = row.DOI
    out['auths'] = row.Authors
    out['yr'] = row.Year
    out['titl'] = row.Title
    out['abbrev'] = shorten_auths(row.Authors) + ' ({})'.format(row.Year)
    nodes = nodes.append(out, ignore_index=True)
    node_list.append(out['titl'])
    if pd.isnull(row.DOI):
        continue
    # iterate over this work's references
    try: # wrap in case DOI not found
        crw = works.doi(row.DOI)
        if 'reference' in crw:
            for r, ref in enumerate(crw['reference']):
                outref = {}
                outref['DOI'] = ref['key']
                # if this ref is not in list of original sources, add to total list
                if 'article-title' in ref:
                    outref['titl'] = ref['article-title']
                else:
                    continue
                orig = source_check(outref['titl'])
                if orig:
                    outref['titl'] = orig
                    ref2source += 1
                edges = edges.append({'source':row.Title, 'target':outref['titl']}, ignore_index=True)
                edge_list.append([row.Title, outref['titl']])
                if not orig:
                    try:
                        outref['auths'] = ref['author']
                    except:
                        if 'unstructured' in ref:
                            refres = ref_res(ref['unstructured'])
                            outref['auths'] = refres['ref_authors']
                            outref['yr'] = refres['ref_year']
                            outref['titl'] = refres['ref_title']
                        else:
                            outref['auths'] = 'unknown' + str(index) + str(r)
                    try:
                        outref['yr'] = ref['year']
                    except:
                        outref['yr'] = 1900
                    outref['abbrev'] = shorten_auths(outref['auths']) + ' ({})'.format(outref['yr'])
                    nodes = nodes.append(outref, ignore_index=True)
                    node_list.append(outref['titl'])
                    
                continue
    except: 
        print('\n\n', ref)
        print(traceback.print_exc())
        
        
print('{} references were recognized as sources'.format(ref2source))





PROCESSING index:  0 








PROCESSING index:  100 






 {'key': 'key-10.3201/eid0901.020047-201206210400-R38', 'unstructured': 'Lathrop S, Bunning M, Singer D, Tiwari T, Baber P, Reiter B, Dengue serosurvey in the Laredo/Nuevo Laredo border community—Texas and Mexico, 1999. In: Program of 49th annual Epidemic Intelligence Service conference. Atlanta, GA: Centers for Disease Control and Prevention; 2000; 40.'}
None


Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 25, in <module>
    if 'reference' in crw:
TypeError: argument of type 'NoneType' is not iterable






PROCESSING index:  200 






 {'key': '10.1016/j.ajic.2009.02.013_bib13', 'doi-asserted-by': 'crossref', 'first-page': '474', 'DOI': '10.1016/S0196-6553(99)70024-6', 'article-title': 'Methodologies used in surveillance of surgical wound infections and bacteremia in Australian hospitals', 'volume': '27', 'author': 'Murphy', 'year': '1999', 'journal-title': 'Am J Infect Control'}
None


Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 25, in <module>
    if 'reference' in crw:
TypeError: argument of type 'NoneType' is not iterable






PROCESSING index:  300 






 {}
None


Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 28, in <module>
    outref['DOI'] = ref['key']
KeyError: 'key'




 {'key': '10.1016/j.amc.2011.05.067_b0350', 'doi-asserted-by': 'crossref', 'first-page': '5931', 'DOI': '10.1073/pnas.0608270104', 'article-title': 'Coarse-grained analysis of stochasticity-induced switching between collective motion states', 'volume': '104', 'author': 'Kolpas', 'year': '2007', 'journal-title': 'PNAS'}
None


Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 25, in <module>
    if 'reference' in crw:
TypeError: argument of type 'NoneType' is not iterable






PROCESSING index:  400 






 {'key': 'e_1_3_2_38_2', 'DOI': '10.1016/j.physa.2008.11.021', 'doi-asserted-by': 'publisher'}
None


Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 25, in <module>
    if 'reference' in crw:
TypeError: argument of type 'NoneType' is not iterable




 {}
None


Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 28, in <module>
    outref['DOI'] = ref['key']
KeyError: 'key'




 {'key': '10.1016/j.jhin.2012.11.003_bib17', 'doi-asserted-by': 'crossref', 'first-page': '105', 'DOI': '10.1089/sur.2007.082', 'article-title': 'Surgical site infection in children: prospective analysis of the burden and risk factors in a sub-Saharan African setting', 'volume': '10', 'author': 'Ameh', 'year': '2009', 'journal-title': 'Surg Infect (Larchmt)'}
None


Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 25, in <module>
    if 'reference' in crw:
TypeError: argument of type 'NoneType' is not iterable




 {'key': '10.1016/S0140-6736(13)61207-6_bib51', 'doi-asserted-by': 'crossref', 'first-page': '834', 'DOI': '10.1093/aje/kws314', 'article-title': 'Infection fatality risk of the pandemic A(H1N1)2009 virus in Hong Kong', 'volume': '177', 'author': 'Wong', 'year': '2013', 'journal-title': 'Am J Epidemiol'}
None


Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 25, in <module>
    if 'reference' in crw:
TypeError: argument of type 'NoneType' is not iterable






PROCESSING index:  500 






 {'key': '10.1016/j.ajic.2013.12.016_bib26', 'doi-asserted-by': 'crossref', 'first-page': '10', 'DOI': '10.1097/SLA.0b013e3181ad5fca', 'article-title': 'Timing of antimicrobial prophylaxis and the risk of surgical site infections: results from the Trial to Reduce Antimicrobial Prophylaxis Errors', 'volume': '250', 'author': 'Steinberg', 'year': '2009', 'journal-title': 'Ann Surg'}
None


 {'key': '10.1016/j.ajic.2013.12.016_bib26', 'doi-asserted-by': 'crossref', 'first-page': '10', 'DOI': '10.1097/SLA.0b013e3181ad5fca', 'article-title': 'Timing of antimicrobial prophylaxis and the risk of surgical site infections: results from the Trial to Reduce Antimicrobial Prophylaxis Errors', 'volume': '250', 'author': 'Steinberg', 'year': '2009', 'journal-title': 'Ann Surg'}
None


Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 25, in <module>
    if 'reference' in crw:
TypeError: argument of type 'NoneType' is not iterable
Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 25, in <module>
    if 'reference' in crw:
TypeError: argument of type 'NoneType' is not iterable




 {'issue': '5', 'key': '10.1016/j.compag.2014.05.009_b0175', 'first-page': '32', 'article-title': 'Application of Zigbee wireless sensor network in Precision agriculture [J]', 'volume': '37', 'author': 'Yuanguai', 'year': '2009', 'journal-title': 'J. Qiongzhou Univ.'}
None


Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 25, in <module>
    if 'reference' in crw:
TypeError: argument of type 'NoneType' is not iterable




 {'key': '10.1111/trf.12620-BIB0006|trf12620-cit-0006', 'doi-asserted-by': 'crossref', 'first-page': '159', 'DOI': '10.2307/2529310', 'article-title': 'The measurement of observer agreement for categorical data', 'volume': '33', 'author': 'Landis', 'year': '1977', 'journal-title': 'Biometrics'}
None


Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 25, in <module>
    if 'reference' in crw:
TypeError: argument of type 'NoneType' is not iterable




 {'key': '10.1016/j.actatropica.2014.04.010_bib0215', 'author': 'World Health Organization', 'year': '2009', 'series-title': 'Dengue and Dengue Hemorrhagic Fever Fact Sheet'}
None


Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 25, in <module>
    if 'reference' in crw:
TypeError: argument of type 'NoneType' is not iterable




 {}
None


Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 28, in <module>
    outref['DOI'] = ref['key']
KeyError: 'key'






PROCESSING index:  600 








PROCESSING index:  700 






 {'key': 'key-10.3201/eid2313.170348-201711171403-R11', 'first-page': '1490', 'article-title': 'A review of sentinel laboratory performance: identification and notification of bioterrorism agents.', 'volume': '134', 'author': 'Wagar', 'year': '2010', 'journal-title': 'Arch Pathol Lab Med'}
None


Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 25, in <module>
    if 'reference' in crw:
TypeError: argument of type 'NoneType' is not iterable




 {'key': '10.1111/cea.13025-BIB0024|cea13025-cit-0024', 'doi-asserted-by': 'crossref', 'first-page': '29', 'DOI': '10.1186/s13601-016-0116-9', 'article-title': 'Scaling up strategies of the chronic respiratory disease programme of the European Innovation Partnership on Active and Healthy Ageing (Action Plan B3: Area 5)', 'volume': '6', 'author': 'Bousquet', 'year': '2016d', 'journal-title': 'Clin Transl Allergy'}
None


Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 25, in <module>
    if 'reference' in crw:
TypeError: argument of type 'NoneType' is not iterable




 {'issue': '4', 'key': '2020022405401886500_bib64', 'doi-asserted-by': 'crossref', 'first-page': '299', 'DOI': '10.1136/bmjqs-2012-001797', 'article-title': 'Surgical checklists: a systematic review of impacts and implementation', 'volume': '23', 'author': 'Treadwell', 'year': '2014', 'journal-title': 'BMJ Qual Saf'}
None


Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 25, in <module>
    if 'reference' in crw:
TypeError: argument of type 'NoneType' is not iterable




 {'key': 'ref48', 'author': 'Alex', 'year': '2012', 'series-title': 'Supervised Sequence Labelling with Recurrent Neural Networks'}
None


Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 25, in <module>
    if 'reference' in crw:
TypeError: argument of type 'NoneType' is not iterable






PROCESSING index:  800 






 {'key': 'e_1_2_9_32_1', 'DOI': '10.1002/mus.25441', 'doi-asserted-by': 'publisher'}
None


Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 25, in <module>
    if 'reference' in crw:
TypeError: argument of type 'NoneType' is not iterable






PROCESSING index:  900 






 {'key': 'ref28', 'DOI': '10.1109/ACCESS.2017.2671678', 'doi-asserted-by': 'publisher'}
None


Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 25, in <module>
    if 'reference' in crw:
TypeError: argument of type 'NoneType' is not iterable




 {'key': 'cit0019', 'DOI': '10.1586/eri.10.24', 'doi-asserted-by': 'publisher'}
None


Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 25, in <module>
    if 'reference' in crw:
TypeError: argument of type 'NoneType' is not iterable




 {'key': 'ref67', 'unstructured': 'Weibo Posts Unaudited Earnings for the Second Quarter in 2018https://tech.sina.com.cn/i/2018-08-08/doc-ihhkuskt9159883.shtml'}
None


Traceback (most recent call last):
  File "<ipython-input-35-7c0a689ac0ac>", line 25, in <module>
    if 'reference' in crw:
TypeError: argument of type 'NoneType' is not iterable


1 references were recognized as sources


In [11]:
nodes["abbrev_dedup"] = ""
i = 0
for dup, count in zip(nodes.duplicated(subset=['abbrev']).values, nodes.groupby('abbrev').cumcount().values):
    abbrev = nodes.abbrev[i]
    if dup:
        abbrev = nodes.abbrev[i] + ' ' + str(count + 1)
        
    nodes['abbrev_dedup'][i] = abbrev
    i += 1

In [1]:
nodes

NameError: name 'nodes' is not defined

In [25]:
# Construction of the citation graph
G = nx.DiGraph()
for n in node_list: 
    G.add_node(n)
G.add_edges_from(edge_list)
nx.set_node_attributes(G, dict(zip(node_list,nodes["abbrev_dedup"])), "abbrev")

In [26]:
G.nodes(data=True)



In [13]:
nx.write_gpickle(G, "test.gpickle")

In [14]:
# Remove nodes that are isolated 
G0=nx.DiGraph(G)
nodes_to_remove = []
for n in G0.nodes(): 
    if G0.degree(n) == 0: 
        nodes_to_remove.append(n)
G0.remove_nodes_from(nodes_to_remove)

print("The unpruned citation graph of the %s has %i nodes and %i edges" %(st,len(G0.nodes()),len(G0.edges())))

# Remove nodes of degree 1
nodes_to_remove = []
for n in G.nodes(): 
    if G.degree(n) == 1: 
        nodes_to_remove.append(n)
G.remove_nodes_from(nodes_to_remove)

# Remove nodes that are isolated 
nodes_to_remove = []
for n in G.nodes(): 
    if G.degree(n) == 0: 
        nodes_to_remove.append(n)
G.remove_nodes_from(nodes_to_remove)

print("The pruned citation graph of the %s has %i nodes and %i edges" %(st,len(G.nodes()),len(G.edges())))

The unpruned citation graph of the Scopus bibliometric dataset on Disease Control through Social Network Surveillance has 18374 nodes and 18910 edges
The pruned citation graph of the Scopus bibliometric dataset on Disease Control through Social Network Surveillance has 697 nodes and 1556 edges


In [15]:
g_diagnostics(G0,st)

The Scopus bibliometric dataset on Disease Control through Social Network Surveillance has 18374 nodes and 18910 edges 

The Scopus bibliometric dataset on Disease Control through Social Network Surveillance is a directed graph
The Scopus bibliometric dataset on Disease Control through Social Network Surveillance is a simple graph
The Scopus bibliometric dataset on Disease Control through Social Network Surveillance is an unweighted graph
The Scopus bibliometric dataset on Disease Control through Social Network Surveillance is not a bipartite graph
The Scopus bibliometric dataset on Disease Control through Social Network Surveillance is not a tree 

The Scopus bibliometric dataset on Disease Control through Social Network Surveillance is not strongly connected and it has 18374 strongly connected components
The largest strongly connected component of this graph has 1 nodes and 0 edges
The Scopus bibliometric dataset on Disease Control through Social Network Surveillance is not weakly co

In [16]:
g_diagnostics(G,st)

The Scopus bibliometric dataset on Disease Control through Social Network Surveillance has 697 nodes and 1556 edges 

The Scopus bibliometric dataset on Disease Control through Social Network Surveillance is a directed graph
The Scopus bibliometric dataset on Disease Control through Social Network Surveillance is a simple graph
The Scopus bibliometric dataset on Disease Control through Social Network Surveillance is an unweighted graph
The Scopus bibliometric dataset on Disease Control through Social Network Surveillance is not a bipartite graph
The Scopus bibliometric dataset on Disease Control through Social Network Surveillance is not a tree 

The Scopus bibliometric dataset on Disease Control through Social Network Surveillance is not strongly connected and it has 697 strongly connected components
The largest strongly connected component of this graph has 1 nodes and 0 edges
The Scopus bibliometric dataset on Disease Control through Social Network Surveillance is not weakly connect

In [17]:
ctype=1
node_color="Girvan-Newman_community"  #'Louvain_community' "connected_component"
bundled=0
nodelabels=0
width=1000
height=1000
node_size=5*np.log(5+hv.dim('in-degree')) # #4*np.log(6+hv.dim('in-degree'))
node_cmap="tab10"
edge_line_width=1
edge_color='lightgreen'
arrowhead_length=0.01
selection_mode='nodes'
selection_policy="nodes"
edge_hover_line_color='green'
node_hover_fill_color='red'
fontsize={'title': '9pt'}
text_font_size='4pt'
text_color='black'
bgcolor='white'
xoffset=0
yoffset=-0.03 #-15
pos=graphviz_layout(G)
title="The citation graph of the %s" %st

citation=hvnx_plot(G,ctype,pos,width,height,node_size,node_cmap,edge_color,edge_line_width,
              title,bundled,nodelabels,xoffset,yoffset,
              arrowhead_length,selection_mode,selection_policy,
              edge_hover_line_color,node_hover_fill_color,
              fontsize,text_font_size, text_color,bgcolor)

hv.save(citation, 'DCSNS_citationGraph.html', backend='bokeh')

citation

No. of Girvan-Newman communities 18


## 2. The Co-Citation Network

In [18]:
G1=G.copy()
medgesout=[]
outtr_d={}
n_out=[n for n in G1.nodes() if G1.out_degree(n)>1]
for n in n_out:
    nn=list(G1.successors(n))
    combs=combinations(nn, 2)
    if n not in outtr_d:
        outtr_d[n]=[]
    for j in combs:
        medgesout.append(tuple(sorted((j[0],j[1]))))
        outtr_d[n].append(tuple(sorted((j[0],j[1]))))
Gcocitation=nx.MultiGraph()
Gcocitation.add_edges_from(medgesout)
weight=[(x, y, {'weight': v}) for (x, y), v in Counter(Gcocitation.edges()).items()]
Gcocitation = nx.Graph(weight)
edge_width=[Gcocitation[u][v]['weight'] for u,v in Gcocitation.edges()]

st2="co-citation graph of the %s" %st
print("The %s has %i nodes and %i edges" %(st2,len(Gcocitation.nodes()),len(Gcocitation.edges())))

The co-citation graph of the Scopus bibliometric dataset on Disease Control through Social Network Surveillance has 486 nodes and 7255 edges


In [19]:
outtr_d={}
for e in G1.edges():
    
    outtr_d[e]=[n for n in G1.nodes if #there exist a triangle with basis e[0], e[1] and 3rd vertex n, where (n,e[0]) and (n,e[1]) in G.edges]

SyntaxError: unexpected EOF while parsing (<ipython-input-19-d022d2a467b1>, line 4)

In [None]:
ctype=0
node_color='Louvain_community'  #"Girvan-Newman_community"  # "connected_component"
bundled=0
nodelabels=0
width=1000
height=1000
node_size=5 #15*np.log(5+hv.dim('in_degree')) #4*np.log(6+hv.dim('in-degree'))
node_cmap="tab10"
edge_line_width=1
edge_color='lightgreen'
arrowhead_length=0.04
selection_mode='nodes'
selection_policy="nodes"
edge_hover_line_color='green'
node_hover_fill_color='red'
fontsize={'title': '9pt'}
text_font_size='4pt'
text_color='black'
bgcolor='white'
xoffset=0
yoffset=-0.03 #-15
pos=graphviz_layout(Gcocitation)
title="The %s" %st2

cocitation=hvnx_plot(Gcocitation,ctype,pos,width,height,node_size,node_cmap,edge_color,edge_line_width,
              title,bundled,nodelabels,xoffset,yoffset,
              arrowhead_length,selection_mode,selection_policy,
              edge_hover_line_color,node_hover_fill_color,
              fontsize,text_font_size, text_color,bgcolor)

hv.save(cocitation, 'DCSNS_cocitationGraph.html', backend='bokeh')

cocitation

## 3. The Network of Bibliographic Coupling

In [None]:
G2=G.copy()
medgesin=[]
n_in=[n for n in G2.nodes() if G2.in_degree(n)>1]
for n in n_in:
    nn=list(G2.predecessors(n))
    combs=combinations(nn, 2)
    for j in combs:
        medgesin.append((j[0],j[1]))
Gbc=nx.MultiGraph()
Gbc.add_edges_from(medgesin)
weight=[(x, y, {'weight': v}) for (x, y), v in Counter(Gbc.edges()).items()]
Gbc = nx.Graph(weight)
edge_width=[Gbc[u][v]['weight'] for u,v in Gbc.edges()]

st4="graph of bibliographic coupling of the %s" %st
print("The %s has %i nodes and %i edges" %(st4,len(Gbc.nodes()),len(Gbc.edges())))

In [None]:
ctype=1
node_color="Girvan-Newman_community"  #'Louvain_community'  #"  # "connected_component"
bundled=0
nodelabels=0
width=1000
height=1000
node_size=5 #15*np.log(5+hv.dim('in_degree')) #4*np.log(6+hv.dim('in-degree'))
node_cmap="tab10"
edge_line_width=1
edge_color='lightgreen'
arrowhead_length=0.04
selection_mode='nodes'
selection_policy="nodes"
edge_hover_line_color='green'
node_hover_fill_color='red'
fontsize={'title': '9pt'}
text_font_size='4pt'
text_color='black'
bgcolor='white'
xoffset=0
yoffset=-0.03 #-15
pos=graphviz_layout(Gbc)
title="The %s" %st4

bibiocoupling=hvnx_plot(Gbc,ctype,pos,width,height,node_size,node_cmap,edge_color,edge_line_width,
              title,bundled,nodelabels,xoffset,yoffset,
              arrowhead_length,selection_mode,selection_policy,
              edge_hover_line_color,node_hover_fill_color,
              fontsize,text_font_size, text_color,bgcolor)

hv.save(bibiocoupling, 'DCSNS_bibiocouplingGraph.html', backend='bokeh')

bibiocoupling

In [None]:
import pickle

nodes_dict = dict(zip(nodes.abbrev, (nodes.titl, nodes.yr)))

with open('abbrev_dict.pickle', 'wb') as handle:
    pickle.dump(nodes_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
doi_dict = dict(zip(nodes.DOI, nodes.abbrev))

with open('doi_dict.pickle', 'wb') as handle:
    pickle.dump(doi_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
nodes