Import the required packages and libraries.

In [105]:
import networkx as nx
import json
from tqdm import tqdm
from numba import jit
import scipy.stats

Read the metadata JSON file in order to build a dictionary and assign to each article a unique identifier (different from the DOI, for easier management of the network).

- metadata_dict -> contains all the articles and their data
- nodes -> dictionary containing tuples to map from DOI to node_id and journal title
- journals_dict -> dictionary to map from Journal_title to unique_id of the journal 

In [2]:
# Read the metadata JSON file. The context manager closes the file handle
# as soon as parsing is done instead of leaving it open for the whole session.
with open("../Data/metadata.json") as metadata:
    metadata_dict = json.load(metadata)

# doi -> (node_id, journal_title)
nodes = dict()
# node_id -> (doi, journal_title, journal_id)
reverse_nodes = dict()

# journal_title -> journal_id
journals_dict = {}

# Give every paper a sequential node_id (its position in the metadata list)
# and every distinct journal title a sequential journal_id.
for node_id, paper in enumerate(metadata_dict):
    paper["node_id"] = node_id
    journal_title = paper['source_title']
    # setdefault hands out the next free id the first time a title is seen
    # and returns the already assigned id on every later occurrence.
    journal_id = journals_dict.setdefault(journal_title, len(journals_dict))
    nodes[paper['id']] = (node_id, journal_title)
    reverse_nodes[node_id] = (paper['id'], journal_title, journal_id)

Build the first network made up of articles.<br>
Also build the undirected network to analyze the structure.

In [3]:
# Directed citation graph between articles (edges are added as source -> target).
papers_network = nx.DiGraph()
# Undirected twin of the same graph, used only for the structural analysis.
undirected_papers_network = nx.Graph()

Read citations JSON file in order to build a dictionary.

In [4]:
# Load the citation records; the context manager guarantees the file handle
# is closed once the JSON has been parsed.
with open('../Data/citations.json') as citations:
    citations_dict = json.load(citations)

Iterate over citations_dict to build a papers citations' network.

In [5]:
# Add one edge per citation, skipping citations whose source or target
# DOI is not among the articles listed in the metadata.
for link in tqdm(citations_dict):
    citing_doi, cited_doi = link['source'], link['target']
    if citing_doi not in nodes or cited_doi not in nodes:
        continue
    edge = (nodes[citing_doi][0], nodes[cited_doi][0])
    undirected_papers_network.add_edge(*edge)
    papers_network.add_edge(*edge)

100%|██████████| 189697/189697 [00:00<00:00, 250989.38it/s]


Save the undirected papers' network for the structural analysis.

In [6]:
# Persist the undirected article graph in GML format for the structural analysis.
nx.write_gml(undirected_papers_network, "../gml format networks/undirected_papers_network.gml")

Compute the <i>PageRank</i> value of the nodes of our network.

In [46]:
# PageRank of every article, then the ranking from highest to lowest score.
page_rank = nx.pagerank(papers_network, alpha=0.85)
pr_list = sorted(page_rank.items(), key=lambda pair: pair[1], reverse=True)

# Show (doi, journal_title, journal_id) for the ten best-ranked articles.
for node_id, _score in pr_list[:10]:
    print(reverse_nodes[node_id])

('10.1056/nejmoa030781', 'New England Journal Of Medicine', 112)
('10.1056/nejmoa030747', 'New England Journal Of Medicine', 112)
('10.1016/s0140-6736(03)13077-2', 'The Lancet', 82)
('10.1056/nejmoa1211721', 'New England Journal Of Medicine', 112)
('10.1126/science.1085952', 'Science', 44)
('10.1016/s0140-6736(03)13412-5', 'The Lancet', 82)
('10.1038/nm1080', 'Nature Medicine', 265)
('10.1126/science.1087139', 'Science', 44)
('10.1038/sj.cr.7290286', 'Cell Research', 457)
('10.1126/science.1085953', 'Science', 44)


Test to see which are the most important articles at this point, retrieved with the <i>Eigenvector Centrality</i> measure.

In [54]:
# Eigenvector centrality of every article, ranked from highest to lowest.
eigenvector_centrality = nx.eigenvector_centrality(papers_network, max_iter=1000)
ec_list = sorted(eigenvector_centrality.items(), key=lambda pair: pair[1], reverse=True)

# Show (doi, journal_title, journal_id) for the ten best-ranked articles.
for node_id, _score in ec_list[:10]:
    print(reverse_nodes[node_id])

('10.1093/infdis/120.5.576', 'Journal Of Infectious Diseases', 414)
('10.1016/s0140-6736(75)93176-1', 'The Lancet', 82)
('10.1177/030098587301000105', 'Veterinary Pathology', 493)
('10.1016/0014-4800(76)90045-9', 'Experimental And Molecular Pathology', 2002)
('10.1093/oxfordjournals.aje.a121171', 'American Journal Of Epidemiology', 849)
('10.1007/bf01253886', 'Archiv F�R Die Gesamte Virusforschung', 613)
('10.1016/0042-6822(72)90062-1', 'Virology', 52)
('10.1177/0300985871008005-00612', 'Veterinary Pathology', 493)
('10.1016/0042-6822(77)90489-5', 'Virology', 52)
('10.1136/bmj.1.5448.1467', 'Bmj', 612)


Test to see which are the most important articles at this point with the In-Degree count.

In [52]:
# node_id -> number of incoming citations.
in_degree_iterable = papers_network.in_degree()
in_degree_dict = {node: in_degree for node, in_degree in in_degree_iterable}

# Rank by in-degree (highest first) and show the ten most cited articles.
in_d_list = sorted(in_degree_dict.items(), key=lambda pair: pair[1], reverse=True)
for node_id, _count in in_d_list[:10]:
    print(reverse_nodes[node_id])

('10.1056/nejmoa030781', 'New England Journal Of Medicine', 112)
('10.1056/nejmoa1211721', 'New England Journal Of Medicine', 112)
('10.1056/nejmoa030747', 'New England Journal Of Medicine', 112)
('10.1016/s0140-6736(03)13077-2', 'The Lancet', 82)
('10.1126/science.1085952', 'Science', 44)
('10.1126/science.1085953', 'Science', 44)
('10.1038/nature02145', 'Nature', 129)
('10.1126/science.1087139', 'Science', 44)
('10.1016/s0140-6736(03)13412-5', 'The Lancet', 82)
('10.1038/nm1024', 'Nature Medicine', 265)


Now we define a scaling function in order to scale values coming from different functions into values in the range [-1, 1]. The reason to do that is to get a visual grasp on the differences between the outcomes of the different algorithms.

In [63]:
def scaling(data, range_min, range_max):
    """Linearly rescale the second element of each tuple into [range_min, range_max].

    Parameters
    ----------
    data : iterable of tuples
        Items whose element 0 is a key (kept untouched) and whose element 1
        is the numeric value to rescale.
    range_min, range_max : number
        Bounds of the target interval.

    Returns
    -------
    list of (key, float)
        One tuple per input item, in input order. An empty input yields an
        empty list. When all values are identical there is no spread to map,
        so every value is placed at ``range_min`` instead of dividing by zero.

    Notes
    -----
    The function is plain Python on purpose: it is called on Python lists of
    tuples, which numba can only handle through its deprecated "reflected
    list" path, and a single linear pass does not pay for JIT compilation.
    """
    pairs = list(data)
    if not pairs:
        return []

    values = [pair[1] for pair in pairs]
    data_min = min(values)
    data_range = max(values) - data_min
    new_range = range_max - range_min

    if data_range == 0:
        # Constant input: avoid ZeroDivisionError, pin everything to the lower bound.
        return [(pair[0], float(range_min)) for pair in pairs]

    return [
        (pair[0], (((pair[1] - data_min) * new_range) / data_range) + range_min)
        for pair in pairs
    ]

Define a function to sort lists of tuples by means of the first element, in our case it will be the node_id.

In [82]:
def sort_list_of_tuples(data):
    """Return a new list with the tuples of `data` in ascending order of their first element.

    The sort is stable, so tuples sharing the same first element keep their
    relative input order.
    """
    def first_element(entry):
        return entry[0]

    ordered = list(data)
    ordered.sort(key=first_element)
    return ordered

Scale values from PageRank.

In [97]:
# Rescale the PageRank scores to [-1, 1] and order them by node_id so that
# position k of the array refers to the k-th smallest node_id.
scaled_pagerank = sort_list_of_tuples(scaling(pr_list, -1, 1))
pagerank_array = [score for _node_id, score in scaled_pagerank]
scaled_pagerank[:3]

[(0, -0.9997101926143466), (1, -0.9999725867395567), (2, -0.9996765771504587)]

Scale values from Eigenvector.

In [98]:
# Rescale the eigenvector-centrality scores to [-1, 1] and order them by node_id.
scaled_eigenvector = sort_list_of_tuples(scaling(ec_list, -1, 1))
eigenvector_array = [score for _node_id, score in scaled_eigenvector]
scaled_eigenvector[:3]

[(0, -1.0), (1, -1.0), (2, -1.0)]

Scale values from In-degree.

In [99]:
# Rescale the in-degree counts to [-1, 1] and order them by node_id.
scaled_in_degree = sort_list_of_tuples(scaling(in_d_list, -1, 1))
in_degree_array = [score for _node_id, score in scaled_in_degree]
scaled_in_degree[:3]

[(0, -0.9980506822612085), (1, -0.9980506822612085), (2, -0.9980506822612085)]

-----------

In order:
- Read the JSON file containing citations' pairs;
- Create a dictionary called "journal_citations" to store the different citations from journal to journal. The structure of this dictionary will be: "citing_journal_id: list_of_cited_journal_ids" (obviously, the list contains repetitions of a cited journal when articles cite more than one paper of the target journal);
- Populate the network as said above. This is accomplished thanks to a temporary "memo" dict that counts the citations to every target journal and that is re-initialized every time the source journal changes.
- Populate the "weights" dictionary. Such dictionary will contain the weight of each specific path retrieved and will be used to assign edge attributes to the network.
- article_citations contains pairs of "source article:[list of cited articles]".

<span style="color:red">To retrieve the importance of edges in between journals:
- $\tau_j$ = PageRank centrality score of article $j$
- $\Phi_J$ = importance of journal $J$
- $j$ = article
- $n_J$ = # articles in journal $J$
- $n_{c_{AB}}$ = # of citations from journal $A$ to journal $B$

$$\Phi_J = \sum_{j=1}^{n_J} \tau_j$$
<br>

$$\omega_{AB} = \dfrac{1}{\Phi_A \cdot n_{c_{AB}}}$$

</span>

Compute the importance of each journal.

In [9]:
# journal_id -> [sum of the PageRank of its articles, number of its articles]
journal_weights = dict()

for publication_id, node_centrality in pr_list:
    if publication_id not in reverse_nodes:
        continue
    journal_id = reverse_nodes[publication_id][2]
    totals = journal_weights.setdefault(journal_id, [0, 0])
    totals[0] += node_centrality
    totals[1] += 1

Store the importance value of journals into <i>journal_weights</i>.

In [10]:
# Collapse every [pagerank_sum, article_count] pair to the summed PageRank
# alone (the sum, not the per-article average, is the journal importance).
# The isinstance guard makes the cell safe to re-run: once a value has been
# collapsed to a number it is left alone instead of raising a TypeError.
for journal, totals in journal_weights.items():
    if isinstance(totals, list):
        journal_weights[journal] = totals[0]

Retrieve citations between journals.

In [11]:
# citing_journal_id -> list of cited journal ids (one entry per citation)
journal_citations = dict()
# citing_article_id -> list of cited article ids
article_citations = dict()

# Walk over the citations once, recording them at article and journal level.
for link in tqdm(citations_dict):
    citing_doi, cited_doi = link['source'], link['target']
    if citing_doi not in nodes or cited_doi not in nodes:
        continue
    citing_article, citing_journal = nodes[citing_doi]
    cited_article, cited_journal = nodes[cited_doi]
    if citing_article == cited_article:
        # An article citing itself carries no information here.
        continue
    article_citations.setdefault(citing_article, list()).append(cited_article)
    if citing_journal in journals_dict and cited_journal in journals_dict:
        citing_journal_id = journals_dict[citing_journal]
        cited_journal_id = journals_dict[cited_journal]
        journal_citations.setdefault(citing_journal_id, list()).append(cited_journal_id)

100%|██████████| 189697/189697 [00:00<00:00, 557547.00it/s]


Build the second network:
- journals_network -> such network will have the different journals as nodes; each edge from journal A to journal B will be weighted with the reciprocal of the product between the importance of journal A and the number of citations that go from articles of journal A to articles of journal B. To be more accurate, target nodes without citations won't be considered at all, thus avoiding the need for a normalization constant (that could have been useful to avoid 0-weights in paths).


</br>
Also in this case, we will build the undirected version of this network, useful then to analyze its structure.

In [12]:
# Build the citations graph
# Directed journal-to-journal citation graph.
journals_network = nx.DiGraph()
# Undirected twin of the same graph, used only for the structural analysis.
undirected_journals_network = nx.Graph()

Populate the networks by adding nodes and edges.

In [13]:
# (source_journal_id, cited_journal_id) -> edge weight
weights = dict()

for source_id, cited_ids in journal_citations.items():
    # Count how many citations go from this journal to each cited journal.
    citation_counts = dict()
    for target_id in cited_ids:
        citation_counts[target_id] = citation_counts.get(target_id, 0) + 1

    source_importance = journal_weights[source_id]
    for cited_journal, n_citations in citation_counts.items():
        # Weight = 1 / (importance of the citing journal * number of citations).
        weights[(source_id, cited_journal)] = 1/(source_importance*n_citations)
        undirected_journals_network.add_edge(source_id, cited_journal)
        journals_network.add_edge(source_id, cited_journal)

Save the undirected version of journals' network.

In [14]:
# Persist the undirected journal graph in GML format for the structural analysis.
nx.write_gml(undirected_journals_network, "../gml format networks/undirected_journals_network.gml")

Assign edge_attributes to the network, according to the previously computed weights.

In [15]:
# Store each (source_journal_id, cited_journal_id) weight on the matching edge
# under the "relative_weights" attribute, which the betweenness computation reads.
nx.set_edge_attributes(journals_network, weights, "relative_weights")

Compute the <i>Betweenness Centrality</i> measure to retrieve the most important journals. The parameter "weight" names the edge attribute assigned to the network in the previous snippet.<br>
The "normalized=True" argument is useful, in this case, because it provides a normalization of the measure for the directed network.

In [16]:
# Weighted betweenness centrality of every journal. Arguments left out
# (k, endpoints, seed) are at their networkx defaults: exact computation over
# all nodes, endpoints not counted.
journals_weighted_betweennes = nx.betweenness_centrality(
    journals_network,
    normalized=True,
    weight='relative_weights',
)

Print the 10 most influential journals.

In [17]:
# Rank journals from highest to lowest betweenness and show the top ten
# as (journal_id, score) pairs.
journals_influence = sorted(
    journals_weighted_betweennes.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
journals_influence[:10]

[(24, 0.2963248760378571),
 (30, 0.07509842353747775),
 (112, 0.05213628249069465),
 (156, 0.04795837894170267),
 (95, 0.027001455513185477),
 (53, 0.02333163908518093),
 (44, 0.021540190209535206),
 (66, 0.019991451747958317),
 (9, 0.019918769751051675),
 (41, 0.01647652165442708)]

Extract the title of the most influential journal.

In [18]:
# Reverse lookup: find the title whose journal_id is the top-ranked one.
top_journal_id = journals_influence[0][0]
for title, journal_id in journals_dict.items():
    if journal_id == top_journal_id:
        most_influential_journal = title
        break
most_influential_journal

'Journal Of Virology'

Count the number of outgoing edges from each article in the dataset.

In [19]:
# Raw count of how many articles each specific article cites
article_citations_tot = dict()

for citation in citations_dict:
    if citation['source'] in nodes:
        source_article_id = nodes[citation['source']][0]
        if citation['target'] in nodes:
            target_article_id = nodes[citation['target']][0]
            if source_article_id != target_article_id:
                if source_article_id not in article_citations_tot:
                    article_citations_tot[source_article_id] = 0
                article_citations_tot[source_article_id] += 1

Build a "journal_influences" dictionary, containing pairs "journal_id: journal_influence", retrieved from the betweenness centrality dictionary computed above.

In [20]:
# Alias (not a copy) of the betweenness scores: journal_id -> betweenness centrality.
journal_influences = journals_weighted_betweennes

<span style="color:red">In the following snippet, a weight is given to citations between articles.<br>
Such weight is computed in the following way:
- $n$ is the raw count of out-going citations from a certain article;
- $\alpha$ is the influence of the specific journal containing the citing article (computed with the betweenness centrality measure);
- $\lambda$ is a constant ($\lambda = 0.1$) that is useful to normalize weights equal to $0$;
<br>
Following a flow of information that goes from the source article to the cited one, the relative weight ($\Phi_{AB}$) of the connection between "article $A$" and "article $B$" is computed as follows:<br>

$$\Phi_{AB} = \dfrac{\alpha + \lambda}{n}$$ 
<br>

The idea behind this computation derives from the will to distribute the importance of a certain article equally between all the articles that it cites. Furthermore, the higher the number of cited articles, the smaller the importance passed to each one of them.</span>


In [21]:
# article_id -> PageRank score, as a dict for direct lookup.
paper_weights = {paper_id: paper_weight for paper_id, paper_weight in pr_list}

Build a new network, that is the citation network of publications contained within the most influential journal.

In [22]:
# Directed article graph that will receive the re-weighted citation edges.
# NOTE(review): the accompanying text describes this as the network of the most
# influential journal only, but the cell that populates it applies no filter on
# most_influential_journal — confirm which is intended.
publications_network = nx.DiGraph()

Add edges to the network and save the weights of these connections.

In [23]:
# articles_weights contains pairs of "(tuple source-target): weight of the connection"
articles_weights = dict()

for citation in tqdm(citations_dict):
    found_all = False
    if citation['source'] in nodes:
        source_article_id = nodes[citation['source']][0]
        source_journal = nodes[citation['source']][1]
        if source_journal in journals_dict:
            source_journal_id = journals_dict[source_journal]
            if source_journal_id in journal_influences:
                if source_article_id in article_citations_tot:
                    article_distributed_weight = ((journal_influences[source_journal_id]/article_citations_tot[source_article_id])*paper_weights[source_article_id])
                    found_all = True
    if found_all:
        if source_article_id in article_citations:
            for cited_article_id in article_citations[source_article_id]:
                if source_article_id != cited_article_id:
                    publications_network.add_edge(source_article_id, cited_article_id)
                    articles_weights[(source_article_id, cited_article_id)] = article_distributed_weight

100%|██████████| 189697/189697 [00:07<00:00, 24839.74it/s]


Set the weights of edges within the most influential journal citations network.

In [24]:
# Store each (source_article_id, cited_article_id) weight on the matching edge under
# the "relative_new_nodes_weights" attribute, read by the eigenvector centrality below.
nx.set_edge_attributes(publications_network, articles_weights, "relative_new_nodes_weights")

Finally, compute the <i>Eigenvector Centrality</i> measure in order to find which publications can be identified as key publications within the reference context.

In [102]:
# Weighted eigenvector centrality on the re-weighted article graph,
# followed by the ranking from highest to lowest score.
key_papers = nx.eigenvector_centrality(
    publications_network,
    max_iter=1000,
    weight='relative_new_nodes_weights',
)
key_papers_list = sorted(key_papers.items(), key=lambda pair: pair[1], reverse=True)

In [111]:
# Despite its name, this holds the ten best-ranked (node_id, score) pairs;
# key_papers_list is already the full ranking sorted by descending score.
three_key_papers = key_papers_list[:10]
three_key_papers

[(25837, 0.004485515506349722),
 (13497, 0.004485417976788674),
 (17440, 0.00448537379024872),
 (39264, 0.004485353130516605),
 (25492, 0.004485224651984496),
 (34785, 0.004485221858837908),
 (2883, 0.0044852197613752835),
 (21306, 0.004485206673108011),
 (26129, 0.004485197539569766),
 (26347, 0.004485112422437013)]

Compare these 10 articles with the 10 articles found at the beginning with PageRank (that is, before assigning weights on the basis of the journals of provenance), in order to see whether our process led to different results.

In [113]:
# Side by side: PageRank top ten (old) versus the new key papers, rank by rank.
print("    Old key papers", "      ---------     New key papers")
for i, el in enumerate(pr_list[:10]):
    print(el, "--", three_key_papers[i])

    Old key papers       ---------     New key papers
(39264, 0.003958045367325669) -- (25837, 0.004485515506349722)
(17440, 0.0034456482660907505) -- (13497, 0.004485417976788674)
(21306, 0.0031866534313306017) -- (17440, 0.00448537379024872)
(25837, 0.002695082402101789) -- (39264, 0.004485353130516605)
(12204, 0.002360879446452903) -- (25492, 0.004485224651984496)
(26129, 0.002305909452775167) -- (34785, 0.004485221858837908)
(12343, 0.002106388438053257) -- (2883, 0.0044852197613752835)
(7812, 0.0020691617187695984) -- (21306, 0.004485206673108011)
(40899, 0.002012513218906165) -- (26129, 0.004485197539569766)
(8523, 0.001937907687581662) -- (26347, 0.004485112422437013)


Compare the ten articles we have retrieved now with the ten extracted at the beginning by means of the Eigenvector centrality.

In [114]:
# Side by side: eigenvector-centrality top ten (old) versus the new key papers.
print("    Old key papers", "      ---------     New key papers")
for i, el in enumerate(ec_list[:10]):
    print(el, "--", three_key_papers[i])

    Old key papers       ---------     New key papers
(34948, 0.22629370522315476) -- (25837, 0.004485515506349722)
(26230, 0.19000624260331436) -- (13497, 0.004485417976788674)
(3907, 0.18100263575061926) -- (17440, 0.00448537379024872)
(11156, 0.17289612871374369) -- (39264, 0.004485353130516605)
(23638, 0.16595863496464636) -- (25492, 0.004485224651984496)
(17015, 0.1644623028002411) -- (34785, 0.004485221858837908)
(4553, 0.1492088280751363) -- (2883, 0.0044852197613752835)
(38019, 0.14674225337814517) -- (21306, 0.004485206673108011)
(22743, 0.1426157058405683) -- (26129, 0.004485197539569766)
(37554, 0.1410142599703751) -- (26347, 0.004485112422437013)


The same is done for the first ten articles extracted with the in-degree count.

In [115]:
# Side by side: in-degree top ten (old) versus the new key papers.
print("    Old key papers", "      ---------     New key papers")
for i, el in enumerate(in_d_list[:10]):
    print(el, "--", three_key_papers[i])

    Old key papers       ---------     New key papers
(39264, 1026) -- (25837, 0.004485515506349722)
(25837, 1014) -- (13497, 0.004485417976788674)
(17440, 985) -- (17440, 0.00448537379024872)
(21306, 794) -- (39264, 0.004485353130516605)
(12204, 726) -- (25492, 0.004485224651984496)
(8523, 623) -- (34785, 0.004485221858837908)
(7573, 623) -- (2883, 0.0044852197613752835)
(7812, 565) -- (21306, 0.004485206673108011)
(26129, 484) -- (26129, 0.004485197539569766)
(4212, 451) -- (26347, 0.004485112422437013)


So, there are differences with respect to the eigenvector, the pagerank and the in-degree.

At this point, we study how the final output of our workflow correlates with the already known measures.

In [121]:
# Rescale the new key-paper scores to [-1, 1] and order them by node_id.
scaled_key_papers = sort_list_of_tuples(scaling(key_papers_list, -1, 1))
key_papers_array = [score for _node_id, score in scaled_key_papers]
# Positional access: the entry returned is node 17440 only if list position
# and node_id coincide.
scaled_key_papers[17440]

(17440, 0.6276473389613471)

In [109]:
# The arrays are compared position by position, so they must list the same
# node_ids in the same order. Equal lengths alone would not reveal a mismatch.
key_paper_ids = [node_id for node_id, _score in scaled_key_papers]
for reference in (scaled_eigenvector, scaled_pagerank, scaled_in_degree):
    assert key_paper_ids == [node_id for node_id, _score in reference], \
        "centrality arrays are not aligned on the same node ids"

# Kendall's tau between the output of our workflow and each baseline measure.
# Each label names the two measures that are actually being compared.
r, p = scipy.stats.kendalltau(key_papers_array, eigenvector_array)
print("Correlation coefficient between our algorithm and the Eigenvector measure:", r)
r, p = scipy.stats.kendalltau(key_papers_array, pagerank_array)
print("Correlation coefficient between our algorithm and the PageRank measure:", r)
r, p = scipy.stats.kendalltau(key_papers_array, in_degree_array)
print("Correlation coefficient between our algorithm and the citation count:", r)

Correlation coefficient between the Eigenvector measure and the citation count: 0.49966997156315573
Correlation coefficient between the PageRank measure and the citation count: 0.7095266980700551
Correlation coefficient between our algorithm and the citation count: 0.7386009881982776


------

Finally retrieve metadata about these new key papers.

In [28]:
# node_ids of the three best-ranked key papers.
first_id, second_id, third_id = (entry[0] for entry in three_key_papers[:3])

# One pass over the metadata to pick up the full record of each of them.
for record in metadata_dict:
    current_id = record['node_id']
    if current_id == first_id:
        key_paper_1 = record
    if current_id == second_id:
        key_paper_2 = record
    if current_id == third_id:
        key_paper_3 = record

In [29]:
# Display the metadata record of the first-ranked key paper.
key_paper_1

{'id': '10.1056/nejmoa1211721',
 'author': 'Zaki, Van Boheemen, Bestebroer, Osterhaus, Fouchier',
 'year': '2012',
 'title': 'Isolation Of A Novel Coronavirus From A Man With Pneumonia In Saudi Arabia',
 'source_title': 'New England Journal Of Medicine',
 'node_id': 25837}

In [30]:
# Display the metadata record of the second-ranked key paper.
key_paper_2

{'id': '10.1371/journal.ppat.1001258',
 'author': 'Huang, Bailey, Weyer, Radoshitzky, Becker, Chiang, Brass, Ahmed, Chi, Dong, Longobardi, Boltz, Kuhn, Elledge, Bavari, Denison, Choe, Farzan',
 'year': '2011',
 'title': 'Distinct Patterns Of Ifitm-Mediated Restriction Of Filoviruses, Sars Coronavirus, And Influenza A Virus',
 'source_title': 'Plos Pathogens',
 'node_id': 13497}

In [31]:
# Display the metadata record of the third-ranked key paper.
key_paper_3

{'id': '10.1056/nejmoa030747',
 'author': 'Drosten, Günther, Preiser, Van Der Werf, Brodt, Becker, Rabenau, Panning, Kolesnikova, Fouchier, Berger, Burguière, Cinatl, Eickmann, Escriou, Grywna, Kramme, Manuguerra, Müller, Rickerts, Stürmer, Vieth, Klenk, Osterhaus, Schmitz, Doerr',
 'year': '2003',
 'title': 'Identification Of A Novel Coronavirus In Patients With Severe Acute Respiratory Syndrome',
 'source_title': 'New England Journal Of Medicine',
 'node_id': 17440}

Save the networks built during the entire process.

In [32]:
# Persist the three directed graphs in GML format, in the order they were built.
for graph, gml_path in (
    (papers_network, "../gml format networks/directed_first_papers_network.gml"),
    (journals_network, "../gml format networks/directed_journals_network.gml"),
    (publications_network, "../gml format networks/directed_final_papers_network.gml"),
):
    nx.write_gml(graph, gml_path)