In [1]:
import pandas as pd
import networkx as nx
import itertools
import matplotlib.pyplot as plt
import numpy as np

pd.set_option('display.max_columns', 50)

NaNs are excluded to avoid connections via NaN. The references have to be cited at least 10 times to be included

In [3]:
wos_refs = pd.read_csv('wos_refs_cleaned.csv', index_col = 0)
#wos_refs.head()

wos_refs = wos_refs[wos_refs["item_id_citing"].notna()] 

ref_counts = wos_refs["item_id_clear"].value_counts()
wos_refs = wos_refs[wos_refs["item_id_clear"].isin(ref_counts[ref_counts >= 10].index)]

num_rows, num_cols = wos_refs.shape
num_rows # 11548

CoCit-value relativizes by putting the absolute citation value in relation to the frequency of citation  

Gmür's CoCit value:  
Formula:  
cocit_value = (co_citations**2) / (min(citation_counts[a], citation_counts[b]) * ((citation_counts[a]+citation_counts[b])/2))  
Intuition: strength of the relationship between references A and B,  
based on the number of co-citations and the relative citation counts of A and B  

In [5]:


def gmuer(a_cited_by, b_cited_by): # Lists of IDs that !!!cite!!! reference A and reference B
    # Get the number of co-citations of citation A and B:
    # Where these sets of citing IDs are overlapping, the references are cited together
    # resulting set "co_citations" contains the paper IDs that cite both A and B
    co_citations = len(set(a_cited_by) & set(b_cited_by)) 

    # Get the absolute citation counts for A and B by simply counting lengths of lists
    a_citations = len(a_cited_by)
    b_citations = len(b_cited_by)
    
    # Check if denominator is zero (which must not happen theoretically)
        
    #if min(a_citations, b_citations) == 0:
        
     #   ipdb.set_trace()  # set breakpoint
        
     #   return 0

    # Calculate first part of the denominator of the Gmür formula
    mincit = min(a_citations, b_citations)

    # Calculate mean of A and B's absolute citation counts as the second part of denominator the Gmür formula
    mean_citations = (a_citations + b_citations) / 2

    # Calculate the CoCit value with the Gmür formula
    cocit_value = (co_citations ** 2) / (mincit * mean_citations)
    
    return cocit_value

# function takes two lists of reference IDs, a_cited_by and b_cited_by, 
# which represent the papers that have cited references A and B

A network is created. Nodes and edges with weights based on the cocit-value are added

In [6]:
# Create a network with weighted edges with the help of Gmür's CoCit value for weighting

# Create a plain undirected graph:

#Ggmuer = nx.DiGraph() # directed Graphs would have directions that are nonexistent in our case

Ggmuer = nx.Graph()

# Add lonely nodes to graph Ggmuer for each unique reference number in the "item_id_clear" column
# unique because every cocitation will later add another edge to another unique reference with a calculated weight
Ggmuer.add_nodes_from(wos_refs["item_id_clear"].unique())

# Group wos_refs by "item_id_clear" and 
# create dictionary of citing ID for each reference ID and 
# apply list function to each group of citing ID that cite one reference ID. 
# Results in pandas Series object, which is then converted to dict. 
citing_papers = wos_refs.groupby("item_id_clear")["item_id_citing"].apply(list).to_dict()
# Keys are "item_id_clear", values are lists with "item_id_citing"


# Create the bigger network:

# Initially, all pairs of nodes could have potential edges.
# Iterate over all pairs of nodes aka cited IDs in the Ggmuer graph
# itertools can generate all possible combinations of elements in a list, 
# We should rather use .combinations instead of .permutations to not get each pair twice!
# '2' indicates that we want to generate combinations of always 2 nodes aka references
for nodeA, nodeB in itertools.combinations(Ggmuer.nodes, 2):
    # Get the list of citing papers for each node
    a_cited_by = citing_papers.get(nodeA, [])
    b_cited_by = citing_papers.get(nodeB, [])

    # generate CoCit value for all existing pairs of nodes
    cocit_value = gmuer(a_cited_by, b_cited_by)

    # add edges with the CoCit value as the weight:
    # CoCit value could be 0 if there are no co-citations between two references, 
    # meaning no citing IDs cite both references
    if cocit_value > 0:
        Ggmuer.add_edge(nodeA, nodeB, weight=cocit_value)

Investigate created network (edge number high due to not yet applied threshold)

In [7]:
# Basic statistics for the network

gmuernodes = Ggmuer.number_of_nodes()
gmueredges = Ggmuer.number_of_edges()

print("Number of nodes:", gmuernodes)
print("Number of edges:", gmueredges)
#Number of nodes: 460
#Number of edges: 39146 

# Number of nodes without filtering min 10: 37881 
# Number of edges without filtering min 10: 443594


Number of nodes: 460
Number of edges: 39146


In [8]:
degrees = Ggmuer.degree() # numbers of edges = degrees
descending_nodes = sorted(degrees, key=lambda x: x[1], reverse=True) # sorts them from node with most edges to least 
descending_nodes # show node with max edges

[('ZOOREC:ZOOR12000000712', 440),
 ('WOS:A1981LG07300011', 423),
 ('WOS:000220697200042', 416),
 ('WOS:000248365100001', 414),
 ('WOS:A1973R135800019', 414),
 ('WOS:A1978FK24700004', 413),
 ('WOS:000242624600030', 412),
 ('000391330900109.9', 411),
 ('WOS:000237778900048', 389),
 ('WOS:000184733700003', 386),
 ('WOS:000274868700004', 378),
 ('WOS:A1971J062800002', 376),
 ('WOS:A1968C243100008', 373),
 ('WOS:000074893400016', 369),
 ('WOS:000188753800045', 368),
 ('WOS:000257466900043', 367),
 ('WOS:000220697200041', 363),
 ('WOS:000175565000057', 362),
 ('WOS:000231503600068', 359),
 ('WOS:A19647207B00008', 354),
 ('WOS:000314285400024', 344),
 ('WOS:000239327200025', 343),
 ('WOS:000235780700087', 340),
 ('WOS:000406169500001', 338),
 ('WOS:000226381300055', 333),
 ('WOS:000232829100043', 332),
 ('WOS:000074150100051', 328),
 ('WOS:000247602700042', 325),
 ('WOS:000239425600092', 321),
 ('WOS:000173159300032', 319),
 ('WOS:000272729200001', 315),
 ('WOS:000233724000078', 315),
 ('WOS:

Checking nodes / reference titles with most edges

In [9]:
#wos_refs[wos_refs.eq("ZOOREC:ZOOR12000000712").any(1)] # Smith: Evolution and the theory of games.
#wos_refs[wos_refs.eq("WOS:A1981LG07300011").any(1)] # Axelrod: The evolution of cooperation
#wos_refs[wos_refs.eq("WOS:000220697200042").any(1)] # Nowak: Emergence of cooperation and evolutionary stability in finite populations
#wos_refs[wos_refs.eq("WOS:000248365100001").any(1)] # Szabó: Evolutionary games on graphs
#wos_refs[wos_refs.eq("WOS:A1973R135800019").any(1)] # Smith: Logic of animal conflict

Draw network in notebook (too big)

In [10]:
# draw the network
#layoutnodes = nx.spring_layout(Ggmuer)  # position the nodes using the spring layout algorithm
#nx.draw_networkx(Ggmuer, layoutnodes, with_labels=True)  # draw the nodes and edges with labels

#plt.show()

Export the network

In [9]:
# export for Gephi tool
nx.write_gexf(Ggmuer, "Ggmuer.gexf")

In [12]:
# Clusters:
# Strongly/ weakly connected components (SCCs)
sccs = list(nx.connected_components(Ggmuer))
len(sccs)
# print nodes in each SCC
#for i, scc in enumerate(sccs):
#    print(f"SCC {i+1}: {scc}")

1

Louvain method is used in the following, but its limitations (modularity, splitting of large clusters) are not addressed here

In [34]:
import community


# LOUVAIN ALGORITHM
# community.best_partition(graph, partition=None, weight='weight', resolution=1.0, randomize=None, random_state=None)
# takes into account the edge weights, that's good for us to use!
# resolution could alter the number of the clusters
clustering = community.best_partition(Ggmuer)


# empty df for later cluster inserting
cluster_df = pd.DataFrame(columns=['Cluster', 'Members'])



# (print clusters and the IDs, this step is not really necessary)
for cluster in set(clustering.values()):
    members = [node for node, part in clustering.items() if part == cluster]
    #print(f"Cluster {cluster}: {members}") # replaced for creating df
    cluster_dict = {'Cluster': cluster, 'Members': members}
    cluster_df = cluster_df.append(cluster_dict, ignore_index=True)

# explode the 'Members' column to create one row per member
cluster_df = cluster_df.explode('Members')

# print the DataFrame
print(cluster_df)
cluster_df.to_csv('clusters_Louvain.csv', index=False)    
    
# num_clusters = len(set(clustering.values()))
#print("Number of clusters:", num_clusters)

# num_clusters
# This results in 8 clusters

# clusterdf = pd.DataFrame([{'Num_Clusters': num_clusters}])

# print(cluster_df)

# count the number of members in each cluster
nodesincluster = cluster_df.groupby('Cluster').count()
nodesincluster

   Cluster              Members
0        0  WOS:000223799100054
0        0  WOS:000231703400006
0        0  WOS:A1977EE49800008
0        0  BCI:BCI199242013327
0        0  WOS:000284554800007
..     ...                  ...
7        7  WOS:000249421700017
7        7  WOS:000278148500075
7        7  WOS:000236584400042
7        7  WOS:000299412600038
7        7  WOS:000303673900001

[460 rows x 2 columns]


  cluster_df = cluster_df.append(cluster_dict, ignore_index=True)
  cluster_df = cluster_df.append(cluster_dict, ignore_index=True)
  cluster_df = cluster_df.append(cluster_dict, ignore_index=True)
  cluster_df = cluster_df.append(cluster_dict, ignore_index=True)
  cluster_df = cluster_df.append(cluster_dict, ignore_index=True)
  cluster_df = cluster_df.append(cluster_dict, ignore_index=True)
  cluster_df = cluster_df.append(cluster_dict, ignore_index=True)
  cluster_df = cluster_df.append(cluster_dict, ignore_index=True)


Unnamed: 0_level_0,Members
Cluster,Unnamed: 1_level_1
0,8
1,58
2,46
3,41
4,52
5,112
6,88
7,55


In [16]:
# Assign cluster number as node attribute
nx.set_node_attributes(Ggmuer, clustering, "cluster")
# Returns the set of nodes in the component of graph containing node n.
nx.node_connected_component(Ggmuer, "WOS:A1973R135800019")

In [36]:
# Create empty dictionary for later storage of rankings within the clusters
pagerank_scores = {}

# iterate over the clusters and compute the PageRank scores for each node in the cluster
for cluster in set(clustering.values()): # set creates unique cluster identifiers
    members = [node for node, part in clustering.items() if part == cluster]
    # every subgraph only contains the nodes in the current cluster that we are iterating over
    subgraph = Ggmuer.subgraph(members)
    # pagerank scores peau a peau added to the pagerank_scores dictionary with dict method "update"
    pagerank_scores.update(nx.pagerank(subgraph, weight='weight'))

# gives tuple list with (node, PageRank score) in descending order
pagerank_scores_sorted = sorted(pagerank_scores.items(), key=lambda x: x[1], reverse=True)



# Now print the 5 most influential nodes in each cluster 
for cluster in set(clustering.values()):
    members = [node for node, part in clustering.items() if part == cluster]
    cluster_pagerank_scores = [(node, score) for node, score in pagerank_scores_sorted if node in members]
    print(f'Cluster {cluster}:')
    for node, score in cluster_pagerank_scores[:5]:
        print(f'{node} (PageRank score: {score:.2f})')
    print()

Cluster 0:
WOS:000284554800007 (PageRank score: 0.16)
WOS:000231703400006 (PageRank score: 0.16)
WOS:000185119300077 (PageRank score: 0.15)
WOS:000223799100054 (PageRank score: 0.14)
WOS:000303440400006 (PageRank score: 0.13)

Cluster 1:
WOS:A1993KH52900003 (PageRank score: 0.04)
WOS:000182785600006 (PageRank score: 0.04)
WOS:A1993KH52900002 (PageRank score: 0.04)
WOS:000167925900004 (PageRank score: 0.03)
WOS:A1990EE29600005 (PageRank score: 0.03)

Cluster 2:
WOS:A1971J062800002 (PageRank score: 0.04)
WOS:000224352400009 (PageRank score: 0.04)
WOS:000184849200010 (PageRank score: 0.03)
WOS:000225377400002 (PageRank score: 0.03)
WOS:000232829100043 (PageRank score: 0.03)

Cluster 3:
WOS:A1997XX37500037 (PageRank score: 0.04)
WOS:000071700200003 (PageRank score: 0.04)
WOS:000261140200009 (PageRank score: 0.04)
WOS:000270355300014 (PageRank score: 0.04)
WOS:000275898300043 (PageRank score: 0.03)

Cluster 4:
WOS:000406169500001 (PageRank score: 0.05)
WOS:000314285400024 (PageRank score: 0

In [47]:
# Check Cluster 0: VACCINATION

#wos_refs[wos_refs.eq("WOS:000284554800007").any(1)]
# https://doi.org/10.1098/rspb.2010.1107
# Imitation dynamics of vaccination behaviour on social networks

#wos_refs[wos_refs.eq("WOS:000231703400006").any(1)]
# https://doi.org/10.1098/rspb.2005.3153
# Imitation dynamics predict vaccinating behaviour

#wos_refs[wos_refs.eq("WOS:000185119300077").any(1)]
# https://doi.org/10.1073/pnas.1731324100
# Group interest versus self-interest in smallpox vaccination policy

#wos_refs[wos_refs.eq("WOS:000223799100054").any(1)]
# https://doi.org/10.1073/pnas.0403823101
# Vaccination and the theory of games

#wos_refs[wos_refs.eq("WOS:000303440400006").any(1)]
# https://doi.org/10.1371/journal.pcbi.1002452
# Evolutionary Game Theory and Social Learning Can Determine How Vaccine Scares Unfold


In [53]:
# Check Cluster 1: Stochastic stability? Markov?

#wos_refs[wos_refs.eq("WOS:A1993KH52900003").any(1)]
# https://doi.org/10.2307/2951778
# The evolution of Conventions

#wos_refs[wos_refs.eq("WOS:000182785600006").any(1)]
# https://www.jstor.org/stable/1555525
# Deterministic Approximation of Stochastic Evolution in Games

#wos_refs[wos_refs.eq("WOS:A1993KH52900002").any(1)]
# https://doi.org/10.2307/2951777
# Learning, Mutation, and Long Run Equilibria in Games

#wos_refs[wos_refs.eq("WOS:000167925900004").any(1)]
# https://doi.org/10.1006/jeth.2000.2696
# Potential games with continuous player sets

#wos_refs[wos_refs.eq("WOS:A1990EE29600005").any(1)]
# http://dx.doi.org/10.1016/0040-5809(90)90011-J
# Stochastic evolutionary game dynamics

In [None]:
# Check Cluster 2: 
wos_refs[wos_refs.eq("WOS:A1971J062800002").any(1)]
#
#
wos_refs[wos_refs.eq("WOS:000224352400009").any(1)]
#
#

wos_refs[wos_refs.eq("WOS:000184849200010").any(1)]
#
#

wos_refs[wos_refs.eq("WOS:000225377400002").any(1)]
#
#

wos_refs[wos_refs.eq("WOS:000232829100043").any(1)]
#
#

In [None]:
# Check Cluster 3:
wos_refs[wos_refs.eq("WOS:A1997XX37500037").any(1)]
#
#
wos_refs[wos_refs.eq("WOS:000071700200003").any(1)]
#
#

wos_refs[wos_refs.eq("WOS:000261140200009").any(1)]
#
#

wos_refs[wos_refs.eq("WOS:000167925900004").any(1)]
#
#

wos_refs[wos_refs.eq("WOS:A1990EE29600005").any(1)]
#
#
WOS:A1997XX37500037 (PageRank score: 0.04)
WOS:000071700200003 (PageRank score: 0.04)
WOS:000261140200009 (PageRank score: 0.04)
WOS:000270355300014 (PageRank score: 0.04)
WOS:000275898300043 (PageRank score: 0.03)

In [None]:
# Check Cluster 4:
wos_refs[wos_refs.eq("WOS:A1993KH52900003").any(1)]
#
#
wos_refs[wos_refs.eq("WOS:000182785600006").any(1)]
#
#

wos_refs[wos_refs.eq("WOS:A1993KH52900002").any(1)]
#
#

wos_refs[wos_refs.eq("WOS:000167925900004").any(1)]
#
#

wos_refs[wos_refs.eq("WOS:A1990EE29600005").any(1)]
#
#

WOS:000406169500001 (PageRank score: 0.05)
WOS:000314285400024 (PageRank score: 0.04)
WOS:000381839100001 (PageRank score: 0.03)
WOS:000272309700016 (PageRank score: 0.03)
WOS:A1982QX29600005 (PageRank score: 0.03)

In [None]:
# Check Cluster 5:
wos_refs[wos_refs.eq("WOS:A1993KH52900003").any(1)]
#
#
wos_refs[wos_refs.eq("WOS:000182785600006").any(1)]
#
#

wos_refs[wos_refs.eq("WOS:A1993KH52900002").any(1)]
#
#

wos_refs[wos_refs.eq("WOS:000167925900004").any(1)]
#
#

wos_refs[wos_refs.eq("WOS:A1990EE29600005").any(1)]
#
#
WOS:000220697200042 (PageRank score: 0.03)
WOS:000237778900048 (PageRank score: 0.02)
WOS:000236955300005 (PageRank score: 0.02)
WOS:000225241700007 (PageRank score: 0.02)
WOS:000274798000017 (PageRank score: 0.02)


In [None]:
# Check Cluster 6:
wos_refs[wos_refs.eq("WOS:A1993KH52900003").any(1)]
#
#
wos_refs[wos_refs.eq("WOS:000182785600006").any(1)]
#
#

wos_refs[wos_refs.eq("WOS:A1993KH52900002").any(1)]
#
#

wos_refs[wos_refs.eq("WOS:000167925900004").any(1)]
#
#

wos_refs[wos_refs.eq("WOS:A1990EE29600005").any(1)]
#
#
000391330900109.9 (PageRank score: 0.03)
WOS:000231503600068 (PageRank score: 0.03)
WOS:000248365100001 (PageRank score: 0.03)
WOS:000222502800002 (PageRank score: 0.02)
WOS:000220697200041 (PageRank score: 0.02)

In [None]:
# Check Cluster 7:
wos_refs[wos_refs.eq("WOS:A1993KH52900003").any(1)]
#
#
wos_refs[wos_refs.eq("WOS:000182785600006").any(1)]
#
#

wos_refs[wos_refs.eq("WOS:A1993KH52900002").any(1)]
#
#

wos_refs[wos_refs.eq("WOS:000167925900004").any(1)]
#
#

wos_refs[wos_refs.eq("WOS:A1990EE29600005").any(1)]
#
#
WOS:000173159300032 (PageRank score: 0.04)
WOS:000247602700042 (PageRank score: 0.03)
WOS:000229048500073 (PageRank score: 0.03)
WOS:000181675200099 (PageRank score: 0.03)
WOS:000170966800048 (PageRank score: 0.03)

In [41]:
# Try to export the whole cluster rankings as a df
# transform list of tuples to dict
pagerank_dict = dict(pagerank_scores_sorted)

# dictionary that connects nodes and clusters
cluster_dict = {node: clustering[node] for node in pagerank_dict.keys()}

# give clusters to pagerank_dict
for node, score in pagerank_dict.items():
    pagerank_dict[node] = {"PageRank score": score, "Cluster": cluster_dict[node]}

# give it to a df
pagerank_df = pd.DataFrame.from_dict(pagerank_dict, orient='index')
pagerank_df.index.name = 'Node' # column for node names as Index

pagerank_df.to_csv('pagerank_scores.csv')
# CHECK AGAIN; PAGERANKS WRONG IN DF!!!