In [1]:
import networkx as nx
import json
from tqdm import tqdm

Read the metadata JSON file to build a dictionary and assign each article a unique identifier (distinct from the DOI, for easier management of the network).

- metadata_dict -> contains all the articles and their data
- nodes -> dictionary containing tuples to map from DOI to node_id and journal title
- journals_dict -> dictionary to map from Journal_title to unique_id of the journal 

In [3]:
# Read the metadata JSON file into a collection of paper records.
# The context manager closes the file handle even if parsing fails, and the
# explicit encoding avoids platform-dependent decoding of the JSON text.
with open("../Data/metadata.json", encoding="utf-8") as metadata:
    metadata_dict = json.load(metadata)

# Create a dict of pairs "doi: (node_id, journal_title)"
# NOTE(review): assumes paper['id'] holds the DOI, as the notebook text says - confirm.
nodes = dict()

# Create a dict of pairs "Journal: unique_identifier"
journals_dict = {}

# Number the papers consecutively in file order and give every distinct
# 'source_title' the next free journal id the first time it is seen.
for node_id, paper in enumerate(metadata_dict):
    paper["node_id"] = node_id
    nodes[paper['id']] = (node_id, paper['source_title'])
    # len(journals_dict) is the next unused id; setdefault leaves the id of an
    # already-registered journal untouched.
    journals_dict.setdefault(paper['source_title'], len(journals_dict))

Network initialization

In [4]:
# Empty undirected graph; the citation-processing cell later adds one edge per
# (citing journal id, cited journal id) pair.
journals_network = nx.Graph()

In order:
- Read the JSON file containing citations' pairs;
- Create a dictionary called "journal_citations" to store the different citations from journal to journal. The structure of this dictionary will be: "citing_journal_id: list_of_cited_journal_ids" (obviously, the list contains repetitions of a cited journal if the articles of the citing journal cite more than one paper of the target journal);
- Populate the network as said above. This is accomplished thanks to a temporary "memo" dict that stores the number of citations to each target journal and that is re-initialized every time the source journal changes.
- Populate the "weights" dictionary. Such dictionary will contain the weight of each specific path retrieved and will be used to assign edge attributes to the network.
- article_citations contains pairs of "source article:[list of cited articles]".

In [5]:
# Read the citations JSON file in order to build a dictionary.
# The context manager closes the file handle even if parsing fails, and the
# explicit encoding avoids platform-dependent decoding of the JSON text.
with open('../Data/citations.json', encoding="utf-8") as citations:
    citations_dict = json.load(citations)

# citing journal id -> list of cited journal ids (filled by the next cell)
journal_citations = dict()
# citing article node_id -> list of cited article node_ids (filled by the next cell)
article_citations = dict()

In [6]:
# Ordered record of every (citing journal id, cited journal id) pair added below
edge_list = []

# Walk all citation pairs, collecting article-level and journal-level citations
for record in tqdm(citations_dict):
    citing_doi = record['source']
    cited_doi = record['target']

    # Skip pairs where either endpoint is missing from the metadata
    if citing_doi not in nodes or cited_doi not in nodes:
        continue

    # Each entry of nodes is a (node_id, journal_title) pair
    citing_node, citing_title = nodes[citing_doi]
    cited_node, cited_title = nodes[cited_doi]

    # Skip an article citing itself
    if citing_node == cited_node:
        continue

    article_citations.setdefault(citing_node, []).append(cited_node)

    # Record the journal-level citation only when both titles have an id
    if citing_title not in journals_dict or cited_title not in journals_dict:
        continue
    journal_source_id = journals_dict[citing_title]
    journal_target_id = journals_dict[cited_title]
    journal_citations.setdefault(journal_source_id, []).append(journal_target_id)

# Add one edge to the graph per distinct (citing journal, cited journal) pair
for source_id, cited_ids in journal_citations.items():
    # memo maps each cited journal id to its number of citations from source_id;
    # only its keys (in first-seen order) are used to create edges.
    memo = {}
    for target_id in cited_ids:
        memo[target_id] = memo.get(target_id, 0) + 1
    for cited_journal in memo:
        edge_list.append((source_id, cited_journal))
        journals_network.add_edge(source_id, cited_journal)

100%|██████████| 189697/189697 [00:00<00:00, 258366.00it/s]


In [7]:
# Small-world coefficient of the journal network.
# nx.sigma relies on the average shortest path length, which is undefined on a
# disconnected graph, and this network has several connected components.
# The computation is therefore restricted to the largest one.
largest_component_nodes = max(nx.connected_components(journals_network), key=len)
largest_component = journals_network.subgraph(largest_component_nodes).copy()

# niter / nrand are kept small because the reference-graph rewiring is slow;
# the fixed seed makes the randomised estimate reproducible.
sigma = nx.sigma(largest_component, niter=3, nrand=5, seed=42)
sigma

KeyboardInterrupt: 

Save

In [None]:
# Persist the journal graph in GML format; the relative path resolves against
# the current working directory.
nx.write_gml(journals_network, "J_UndirNet.gml")

Test

In [9]:
# Print the average shortest path length of each connected component separately,
# working on an independent copy of every component's induced subgraph.
component_node_sets = nx.connected_components(journals_network)
for C in tqdm(journals_network.subgraph(c).copy() for c in component_node_sets):
    print(nx.average_shortest_path_length(C))

4it [00:29,  7.26s/it]

2.989171113335879
1.0
1.0
1.0





ER Random Network

In [None]:
# Size of the random reference graphs: n nodes and E edges.
# NOTE(review): presumably these mirror the journal network's node and edge
# counts - confirm against the graph built above.
n = 5661
E = 39100

# Edge probability for G(n, p) whose expected edge count is E, i.e. E divided by
# the number of possible node pairs n*(n-1)/2.
# Do not override this with a dense value such as 0.5: at this n it yields
# millions of edges and makes the path-length computations impractical.
p = (2*E)/((n-1)*n)

In [None]:
# Random reference graph with exactly n nodes and E edges (G(n, m) model).
# The fixed seed makes the reported C_r / L_r reproducible across runs.
er_no_p_Random_network = nx.gnm_random_graph(n, E, seed=42)

# Average clustering coefficient and average shortest path length of the
# random graph. The path length call raises if the graph is disconnected.
C_r = nx.average_clustering(er_no_p_Random_network)
L_r = nx.average_shortest_path_length(er_no_p_Random_network)
print("C_r: ", C_r)
print("L_r: ", L_r)

C_r:  0.002385061659264084
L_r:  3.588537217325411


In [None]:
# Number of Erdos-Renyi graphs to sample and average over.
N_RANDOM_NETWORKS = 11

# i counts the graphs that were fully processed; the following cell divides the
# two running sums by it to obtain the averages.
i = 0
er_clusters_sum = 0
er_shortest_sum = 0
for run_seed in tqdm(range(N_RANDOM_NETWORKS)):
    # A distinct, fixed seed per run keeps the samples different from each
    # other yet reproducible on re-execution.
    G = nx.erdos_renyi_graph(n, p, seed=run_seed)

    # Compute both metrics before touching the accumulators so that an
    # interrupted run never leaves the sums and the counter out of step.
    # average_shortest_path_length raises if G is disconnected.
    clustering = nx.average_clustering(G)
    shortest_path = nx.average_shortest_path_length(G)

    er_clusters_sum += clustering
    er_shortest_sum += shortest_path
    i += 1

  0%|          | 0/11 [00:06<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Turn the running sums into means over the i random graphs processed so far
average_clustering_coefficient = er_clusters_sum / i
average_shortest_path = er_shortest_sum / i

print("Average shortest path: ", average_shortest_path)
print("Average clustering coefficient: ", average_clustering_coefficient)

Average shortest path:  3.588099945904976
Average clustering coefficient:  0.0024380069126278344


In [None]:
# Manual small-world coefficient: sigma = (C / C_r) / (L / L_r), from
# hard-coded inputs.
# NOTE(review): presumably C and L describe the journal network and C_r and L_r
# the random reference - confirm where C = 0.585 comes from.
# NOTE(review): C_r is rounded to 0.002 although the random-graph cells report
# about 0.0024; sigma is very sensitive to this value - confirm.
C, L = 0.585, 2.989
C_r, L_r = 0.002, 3.591

N = C / C_r          # clustering ratio
D = L / L_r          # path-length ratio
sigma_value = N / D
print(N, D, sigma_value)

292.5 0.8323586744639375 351.4110070257612


In [None]:
# Persist G in GML format; G is whatever graph the ER sampling loop generated
# last, so earlier samples are not saved.
nx.write_gml(G, "ER_Random.gml")