In [1]:
import networkx as nx
from networkx.algorithms import community # ~louvain
import community as community_louvain # python louvain
import itertools
from itertools import combinations
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Fonts for plots
#mpl.rcParams['font.serif'] = 'Times New Roman'
#plt.rcParams['font.family'] = 'serif'

pd.set_option('display.max_columns', None)

# Path to data retrieval and storage
path = "C:/Users/kleinow/ownCloud/MA_Neuro"

In [2]:
refs = pd.read_csv(path + "/cn_refs_cleaned.csv")


In [3]:
refs.head()

Unnamed: 0,item_id_citing,item_id_cited,citing_pubyear,ref_seq_nr,ref_pubyear,ref_item_title,ref_source_title,ref_authors,ref_volume,ref_pages,ref_doi,title_lower,item_id_clear
0,WOS:000086279300008,WOS:A1995RP75600044,2000,5,1995.0,MODULATION OF MEMORY FIELDS BY DOPAMINE D1 REC...,NATURE,"{""WILLIAMS, GV""}",376,572,,modulation of memory fields by dopamine d1 rec...,WOS:A1995RP75600044
1,WOS:000086279300008,WOS:A1996VV46700007,2000,7,1996.0,Regional and cellular fractionation of working...,PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCE...,"{""GoldmanRakic, PS""}",93,13473,,regional and cellular fractionation of working...,WOS:A1996VV46700007
2,WOS:000086279300008,WOS:A1985ARE2300019,2000,14,1985.0,PRIMATE FRONTAL EYE FIELDS .2. PHYSIOLOGICAL A...,JOURNAL OF NEUROPHYSIOLOGY,"{""BRUCE, CJ""}",54,714,,primate frontal eye fields .2. physiological a...,WOS:A1985ARE2300019
3,WOS:000086279300008,WOS:A1980JC97100014,2000,15,1980.0,DISSOCIATION OF VISUAL AND SACCADE-RELATED RES...,JOURNAL OF NEUROPHYSIOLOGY,"{""MAYS, LE""}",43,207,,dissociation of visual and saccade-related res...,WOS:A1980JC97100014
4,WOS:000086279300008,WOS:A1985ADU5200001,2000,16,1985.0,PRIMATE FRONTAL EYE FIELDS .1. SINGLE NEURONS ...,JOURNAL OF NEUROPHYSIOLOGY,"{""BRUCE, CJ""}",53,603,,primate frontal eye fields .1. single neurons ...,WOS:A1985ADU5200001


Implement the CoCit Value to relativize the absolute citation number in relation to the frequency of citation

In [4]:
# Define a function "gmuer" for Gmür's CoCit value:
# Formula: 
# cocit_value = (co_citations**2) / (min(citation_counts[a], citation_counts[b]) * ((citation_counts[a]+citation_counts[b])/2))
# Intuition: strength of the relationship between references A and B, 
# based on the number of co-citations and the relative citation counts of A and B

def gmuer(a_cited_by, b_cited_by): # Lists of IDs that !!!cite!!! reference A and reference B
    # Get the number of co-citations of citation A and B:
    # Where these sets of citing IDs are overlapping, the references are cited together
    # resulting set "co_citations" contains the paper IDs that cite both A and B
    """
    Calculate Gmür's Co-citation value for two references A and B.
    
    Parameters:
        a_cited_by (list): List of paper IDs that have cited reference A.
        b_cited_by (list): List of paper IDs that have cited reference B.
        
    Returns:
        cocit_value (float): The Co-citation value based on Gmür's formula.
    
    Formula:
        cocit_value = (co_citations ** 2) / (min_citations * mean_citations)
    
    Itntuition:
        The function measures the strength of the relationship between 
        references A and B based on the number of co-citations and their 
        individual citation counts.
        
        - co_citations: Number of papers that cite both A and B.
        - min_citations: Minimum of the citation counts of A and B.
        - mean_citations: Mean of the citation counts of A and B.
        
    Example:
        gmuer([1,2,3], [3,4,5]) should return a float value representing
        the strength of the relationship between the papers in lists 
        a_cited_by and b_cited_by.
    """
    co_citations = len(set(a_cited_by) & set(b_cited_by)) 

    # Get the absolute citation counts for A and B by simply counting lengths of lists
    a_citations = len(a_cited_by)
    b_citations = len(b_cited_by)
    
    
    # Problem: Error "ZeroDivisionError: float division by zero"
    # gmuer([], [3, 4, 5]) # test by Nikoleta
    # Must not happen, but if denominator ever turns zero, add:

    #if min(a_citations, b_citations) == 0:
        
     #   ipdb.set_trace()  # set breakpoint
        
     #   return 0

    # Calculate first part of the denominator of the Gmür formula
    # a weak relationship with for example low min for A or B can limit the usefulness of the CoCit value 
    mincit = min(a_citations, b_citations)

    # Calculate mean of A and B's absolute citation counts as the second part of denominator the Gmür formula
    mean_citations = (a_citations + b_citations) / 2

    # Calculate the CoCit value with the Gmür formula
    cocit_value = (co_citations ** 2) / (mincit * mean_citations)
    
    return cocit_value

# function takes two lists of reference IDs, a_cited_by and b_cited_by, 
# which represent the papers that have cited references A and B



Graph

In [5]:
G_RefNet = nx.Graph()

# Add lonely nodes to the graph for each unique reference number in the "item_id_clear" column
G_RefNet.add_nodes_from(refs["item_id_clear"].unique())

# Group 'refs' by "item_id_clear" and create a dictionary where
# the keys are the 'item_id_clear' and the values are lists of 'item_id_citing' 
citing_papers = refs.groupby("item_id_clear")["item_id_citing"].apply(list).to_dict()

# Iterate over all pairs of nodes (cited IDs) in the Ggmuer graph
# Generate all possible combinations of 2 nodes
for nodeA, nodeB in itertools.combinations(G_RefNet.nodes, 2):
    
    # list of citing papers for each node
    a_cited_by = citing_papers.get(nodeA, [])
    b_cited_by = citing_papers.get(nodeB, [])
    
    # Calculate CoCit value
    cocit_value = gmuer(a_cited_by, b_cited_by)
    
    # Add an edge with the CoCit value as the weight, only if the value is greater than zero
    if cocit_value > 0:
        G_RefNet.add_edge(nodeA, nodeB, weight=cocit_value)

Graph metrics

In [6]:
# Function for used networks metrics
def calculate_metrics(G):
    """
    Calculate various basic and advanced metrics for the given network graph.

    Parameters:
    - G (networkx.Graph): The input graph for which metrics are to be calculated.

    Returns:
    - metrics (dict): A dictionary containing various network metrics.

    Metrics Calculated:
    - # Nodes: The total number of nodes in the graph.
    - # Edges: The total number of edges in the graph.
    - # Isolated nodes: The number of nodes that have no connections.
    - % Isolated nodes: The percentage of nodes that are isolated.
    - # Connected components: The number of connected components in the graph.
    - Size of largest components: The size (number of nodes) of the largest connected component in the graph.
    - Av. degree: The average degree of the graph.
    - # Communities: The number of communities detected by the Louvain method.
    - Modularity: Modularity score indicating the strength and degree of division of the network into communities.
    - Clustering Coefficient: Average clustering coefficient of the graph.

    Example Usage:
    >>> G = nx.erdos_renyi_graph(100, 0.1)
    >>> metrics = calculate_metrics(G)
    >>> print(metrics)
    """
    metrics = {}
    # Basic metrics
    metrics['# Nodes'] = G.number_of_nodes()
    metrics['# Edges'] = G.number_of_edges()
    metrics['# Isolated nodes'] = len(list(nx.isolates(G)))  # Isolated nodes have no edges, so an author who has not collaborated with anyone else!
    metrics['% Isolated nodes'] = (metrics['# Isolated nodes'] / metrics['# Nodes']) * 100 if metrics['# Nodes'] != 0 else 0  # added if statement to avoid division by zero
    
    # Advanced metrics
    metrics['# Connected components'] = nx.number_connected_components(G)
    metrics['Size of largest components'] = len(max(nx.connected_components(G), key=len))
    metrics['Av. degree'] = sum(dict(G.degree()).values()) / metrics['# Nodes']
    
    # Number of communities and modularity
    partition = community_louvain.best_partition(G)  # Compute best partition of graph nodes for modularity
    metrics['# Communities'] = len(set(partition.values()))
    metrics['Modularity'] = community_louvain.modularity(partition, G)
    
    # Clustering Coefficient
    metrics['Clustering Coefficient'] = nx.average_clustering(G)
    
    return metrics

In [7]:

metrics= calculate_metrics(G_RefNet)

#Df with metrics side-by-side
df_metrics = pd.DataFrame({'Metric': list(metrics.keys()),
                           'G': list(metrics.values())})

In [8]:
df_metrics

Unnamed: 0,Metric,G
0,# Nodes,9650.0
1,# Edges,578434.0
2,# Isolated nodes,0.0
3,% Isolated nodes,0.0
4,# Connected components,5.0
5,Size of largest components,9620.0
6,Av. degree,119.882694
7,# Communities,30.0
8,Modularity,0.693456
9,Clustering Coefficient,0.594522


In [9]:
# ALTERNATIVE FOR NETWORK, DONT RUN, TAKES AGES

#G_RefNet = nx.Graph()

# Loop through each unique citing article
#for citing_article in refs['item_id_citing'].unique():
    
    # Find the set of references cited by this article
    #cited_articles = refs[refs['item_id_citing'] == citing_article]['item_id_clear'].dropna().tolist()
    
    # Generate all pairs of cited articles
    #cited_pairs = combinations(cited_articles, 2)
    
    # Loop through each pair to calculate Gmür's CoCit value
    #for a, b in cited_pairs:
        #a_cited_by = refs[refs['item_id_clear'] == a]['item_id_citing'].dropna().tolist()
        #b_cited_by = refs[refs['item_id_clear'] == b]['item_id_citing'].dropna().tolist()
        
        # Calculate the CoCit value using gmuer() function
        #cocit_value = gmuer(a_cited_by, b_cited_by)
        
        # Add edge to graph
        #if G_RefNet.has_edge(a, b):
        #    G_RefNet[a][b]['weight'] += cocit_value
        #else:
            #G_RefNet.add_edge(a, b, weight=cocit_value)

Central References based on Centrality Measures

In [10]:
# Function to calculate the top 10 authors by centrality
def top_10_central(G, centrality_type):
    """
    Function to calculate the top 10 most central authors in given network graph.

    Parameters:
    G (Graph): The network graph
    centrality_type (str): The type of centrality to calculate ('betweenness' or 'closeness')

    Returns:
    list: Sorted list of tuples with the top 10 authors and their centrality score
    """
    if centrality_type == 'betweenness': # Check type of centrality metric to calculate
        centrality_dict = nx.betweenness_centrality(G) # Betweenness ...
    elif centrality_type == 'closeness':
        centrality_dict = nx.closeness_centrality(G) # Closeness ...
    else:
        return "Invalid centrality type" # added this to avoid errors
        
    sorted_centrality = sorted(centrality_dict.items(), key=lambda x: x[1], reverse=True)[:10]
    return sorted_centrality



NOTE: Betweenness Centrality has a runtime complexity of 
O(nm) for unweighted graphs and O(nm+n^2logn) for weighted graphs (which is here), where 
n is the number of nodes and m is the number of edges. Therefore, this step takes a lot of time to compute!

In [13]:
degree_dict = dict(G_RefNet.degree()) # all nodes' degrees

# Sort nodes by degree in descending order
sorted_degree = sorted(degree_dict.items(), key=lambda x: x[1], reverse=True)

top_10_degree = sorted_degree[:10]

# into df
df_top_10_degree = pd.DataFrame(top_10_degree, columns=['Node (Reference ID)', 'Degree'])

# reference titles to df
top_10_nodes = [node for node, _ in top_10_degree]
df_top_10_degree['Reference Title'] = df_top_10_degree['Node (Reference ID)'].map(
    refs.set_index('item_id_clear')['ref_item_title'].to_dict()
)

df_top_10_degree


Unnamed: 0,Node (Reference ID),Degree,Reference Title
0,WOS:A1952UH81500008,2324,A QUANTITATIVE DESCRIPTION OF MEMBRANE CURRENT...
1,WOS:A1982NL32300028,1746,NEURAL NETWORKS AND PHYSICAL SYSTEMS WITH EMER...
2,WOS:A1997WN12300029,1598,A neural substrate of prediction and reward
3,WOS:000188260400014,1595,Simple model of spiking neurons
4,WOS:000273784600012,1384,The free-energy principle: a unified brain the...
5,WOS:A1972L403600001,1337,EXCITATORY AND INHIBITORY INTERACTIONS IN LOCA...
6,WOS:000078409300017,1326,Predictive coding in the visual cortex: a func...
7,WOS:000087725300001,1266,Dynamics of sparsely connected networks of exc...
8,WOS:000208047200002,1241,Distributed Hierarchical Processing in the Pri...
9,WOS:000311977900005,1229,Canonical Microcircuits for Predictive Coding


In [14]:
# Top 10 central nodes based on betweenness centrality for both graphs
top_10_betweenness = top_10_central(G_RefNet, 'betweenness')




In [15]:
top_10_betweenness

[('WOS:A1952UH81500008', 0.07333318343487984),
 ('WOS:A1982NL32300028', 0.030172162763455516),
 ('WOS:A1997WN12300029', 0.023856116902607836),
 ('WOS:000188260400014', 0.020148906427133392),
 ('WOS:000263556500012', 0.018364397625382212),
 ('WOS:A1972L403600001', 0.018020321538611973),
 ('WOS:000208047200002', 0.014422092971714033),
 ('WOS:000273784600012', 0.013586382699671143),
 ('WOS:000078409300017', 0.012510979464883156),
 ('WOS:A1997XM55000001', 0.01096313396256101)]

In [16]:
# Top 10 central nodes based on closeness centrality for both graphs
top_10_closeness = top_10_central(G_RefNet, 'closeness')

In [17]:
top_10_closeness

[('WOS:A1952UH81500008', 0.5610938135701811),
 ('WOS:A1982NL32300028', 0.5354044262375429),
 ('WOS:000188260400014', 0.5283538086899771),
 ('WOS:A1997WN12300029', 0.5283246982872944),
 ('WOS:A1972L403600001', 0.5235650163207423),
 ('WOS:000087725300001', 0.5161809373910963),
 ('WOS:000273784600012', 0.516125371328618),
 ('WOS:000263556500012', 0.5145467521954494),
 ('WOS:000078409300017', 0.5144915373921234),
 ('WOS:000362952700021', 0.513197392235183)]

In [18]:
titles_betweenness = [find_title(x[0]) for x in top_10_betweenness]
# Add titles for Closeness
titles_closeness = [find_title(x[0]) for x in top_10_closeness_original]

# Combine into df
df_top_10_centralities = pd.DataFrame({
    'Top 10 Betweenness': [f"{x[0]} ({x[1]:.3f})" for x in top_10_betweenness],
    'Title (Betweenness)': titles_betweenness,
    'Top 10 Closeness': [f"{x[0]} ({x[1]:.3f})" for x in top_10_closeness_original],
    'Title (Closeness)': titles_closeness,
})

NameError: name 'find_title' is not defined