In [50]:
import networkx as nx
import matplotlib.pyplot as plt
import json
from tqdm import tqdm
import community
from community import community_louvain

In [51]:
# Read metadata JSON file in order to build a dictionary
metadata = open("metadata.json")
metadata_dict = json.load(metadata)

# Create a dict of pairs "doi: (node_id, year)"
nodes = dict()

# Add a number as unique identifier of each one of the nodes
i = 0
for obj in metadata_dict:
    obj["node_id"] = i
    nodes[obj['id']] = (obj['node_id'], obj['source_title'])
    i+=1

In [52]:
journals_dict = {}

j = 0
for paper in metadata_dict:
    if paper['source_title'] not in journals_dict:
        journals_dict[paper['source_title']] = j
        j+=1
print(j)

5661


In [53]:
# Build the citations graph
citations_graph = nx.DiGraph()

In [55]:
# Read citations JSON file in order to build a dictionary
citations = open('citations.json')
citations_dict = json.load(citations)
journal_citations = dict()

# Iterate over citations_dict to build nodes and edges
for citation_obj in tqdm(citations_dict):
    source = citation_obj['source']
    target = citation_obj['target']
    if source in nodes and target in nodes:
        source_journal = nodes[source][1]
        target_journal = nodes[target][1]
        if source_journal in journals_dict:
            if target_journal in journals_dict:
                source_id = journals_dict[source_journal]
                target_id = journals_dict[target_journal]
                if source_id not in journal_citations:
                    journal_citations[source_id] = list()
                journal_citations[source_id].append(target_id)

citations_count = dict()
for source_id in journal_citations:
    memo = dict()
    for target_id in journal_citations[source_id]:
        if target_id not in memo:
            memo[target_id] = 0
        memo[target_id] += 1
    for cited_journal in memo:
        citations_graph.add_edge(source_id, cited_journal, weight=1/memo[cited_journal])

100%|██████████| 189697/189697 [00:00<00:00, 633825.52it/s]


In [56]:
journal_citations

{1654: [95,
  95,
  1398,
  308,
  308,
  308,
  308,
  519,
  135,
  52,
  375,
  375,
  375,
  519,
  308,
  135,
  268,
  268,
  268,
  268,
  268,
  268,
  1487,
  275,
  275,
  66,
  354,
  24,
  24,
  24,
  346,
  855,
  199,
  199,
  199,
  199,
  199,
  199,
  199,
  199,
  199,
  199,
  199,
  199,
  199,
  199,
  199,
  199,
  199,
  199,
  199,
  199,
  275,
  52,
  30,
  30,
  95,
  354,
  519,
  519,
  199,
  180,
  479,
  65,
  308,
  180,
  95,
  116,
  24,
  95,
  95,
  143,
  24,
  24,
  24,
  199,
  308,
  1487,
  268,
  268,
  855,
  24,
  24,
  129,
  375,
  1262,
  30,
  199,
  199,
  539,
  959,
  180,
  35,
  30,
  29,
  135,
  199,
  66,
  199,
  24,
  24,
  79,
  68,
  24,
  95,
  95,
  135,
  275,
  275,
  44,
  53,
  29,
  29,
  199,
  199,
  79,
  275,
  43,
  41,
  24,
  179,
  83,
  199,
  135,
  24,
  750,
  308,
  66,
  199,
  4,
  95,
  65,
  479,
  308,
  216,
  539,
  129,
  53,
  29,
  29,
  948,
  179,
  179,
  24,
  24,
  24,
  24,
  24,
  24,
  9,

In [70]:
def in_degree_centrality(G):
    if len(G) <= 1:
        return {n: 1 for n in G}

    s = 1.0 / (len(G) - 1.0)
    centrality = {n: d * s for n, d in G.in_degree()}
    return centrality
s = in_degree_centrality(citations_graph)
s = sorted(s.items(), key=lambda item: item[1], reverse=True)[:100]
for el in journals_dict:
    if journals_dict[el] == s[0][0]:
        most = el
        print(el)

Journal Of Virology


In [80]:
l = []
for el in metadata_dict:
    if el['source_title'] == most:
        l.append(el['id'])
# questi vanno salvati, poi devo vedere quanto si citano a vicenda e fare l'eigenvector di questi, poi vedere quanto dista dal risultato finale.

In [82]:
cit_tuples = list()
for el in citations_dict:
    cit_tuples.append((el['source'], el['target']))

journal_network_cit = nx.DiGraph()

for el in cit_tuples:
    if el[0] in l and el[1] in l:
        journal_network_cit.add_edge(el[0], el[1])

In [90]:
centrality = nx.eigenvector_centrality_numpy(journal_network_cit)
list_centrality = sorted(centrality.items(), key=lambda item: item[1], reverse=True)[:100]
list_centrality

[('10.1128/jvi.01118-06', 0.3429692256245491),
 ('10.1128/jvi.00239-10', 0.34296922545277064),
 ('10.1128/jvi.00676-08', 0.34296922545277053),
 ('10.1128/jvi.02205-08', 0.34296921840985045),
 ('10.1128/jvi.00560-06', 0.3249182230391743),
 ('10.1128/jvi.00415-08', 0.25271416648925277),
 ('10.1128/jvi.77.16.8801-8811.2003', 0.19856113992337907),
 ('10.1128/jvi.79.24.15511-15524.2005', 0.19856112575165075),
 ('10.1128/jvi.02062-10', 0.180510123853396),
 ('10.1128/jvi.78.19.10628-10635.2004', 0.18051012316627732),
 ('10.1128/jvi.01542-10', 0.1805101228227245),
 ('10.1128/jvi.02041-07', 0.1805101164669144),
 ('10.1128/jvi.01412-08', 0.14440808819195664),
 ('10.1128/jvi.78.12.6134-6142.2004', 0.1263570925636134),
 ('10.1128/jvi.01248-09', 0.1263570917047207),
 ('10.1128/jvi.00140-10', 0.10830607271449626),
 ('10.1128/jvi.02744-05', 0.09025507167512734),
 ('10.1128/jvi.80.9.4211-4219.2006', 0.0902550684113367),
 ('10.1128/jvi.02232-10', 0.09025506265675787),
 ('10.1128/jvi.78.11.5642-5650.200

In [58]:
articles_citations = nx.DiGraph()

for citation_obj in tqdm(citations_dict):
    source = citation_obj['source']
    target = citation_obj['target']
    if source in nodes and target in nodes:
        articles_citations.add_edge(source, target)

100%|██████████| 189697/189697 [00:00<00:00, 439244.41it/s]


In [60]:
pr = nx.pagerank(articles_citations, alpha=0.9)

In [92]:
prs = sorted(pr.items(), key=lambda item: item[1], reverse=True)[:100]
nodes[prs[0][0]]
prs
for el in prs:
    if el[0] == "10.1128/jvi.01118-06":
        print("trovato")

In [59]:
bet_centrality = nx.betweenness_centrality(citations_graph, normalized = True,
											endpoints = False)
#print(bet_centrality)

KeyboardInterrupt: 

In [None]:
for el in bet_centrality:
    if bet_centrality[el] != 0.0:
        print(el, bet_centrality[el])
        

1654 0.00010944855583481178
95 0.019339108002971624
268 0.0011609593835111582
1487 0.0008756149494939489
275 0.00028169878714172245
66 0.01778873467532449
354 0.0019266106562204355
24 0.07327742146605086
346 0.0002838466198532562
855 6.258494373310161e-06
199 0.002014732954975115
30 0.018443343752699688
180 0.0015126509759647458
65 0.0012287009853325737
116 0.0005486750354392703
143 0.001999062164617881
129 0.014285052255646054
1262 0.0003217663789265654
539 0.007545466068934214
35 0.0007850994219562083
29 0.01003877498872614
79 0.008070529220760693
68 0.001222168267449444
44 0.01656592053692261
53 0.025925579204212013
41 0.021730446271925843
179 0.005334095501829079
750 0.0009788541115268866
4 0.0007504234296211812
948 0.0020768761466819297
9 0.022215536673136835
112 0.022402601294846176
58 0.0013658326766641687
344 8.426409222413502e-06
1590 2.601732420770483e-08
80 0.005964560229150045
253 0.0009546272705679133
958 0.0009456940696387093
1308 0.00010397251687589968
613 3.089280911831

In [None]:
bet_centrality_list = sorted(bet_centrality.items(), key=lambda item: item[1], reverse=True)

In [None]:
bet_centrality_list


[(24, 0.07327742146605086),
 (156, 0.030148203256410412),
 (53, 0.025925579204212013),
 (112, 0.022402601294846176),
 (9, 0.022215536673136835),
 (41, 0.021730446271925843),
 (95, 0.019339108002971624),
 (30, 0.018443343752699688),
 (66, 0.01778873467532449),
 (44, 0.01656592053692261),
 (132, 0.015709753316275277),
 (129, 0.014285052255646054),
 (74, 0.011186973323754498),
 (319, 0.010319850798066623),
 (265, 0.0100537580604477),
 (29, 0.01003877498872614),
 (131, 0.009708430575271225),
 (452, 0.009177661360613956),
 (79, 0.008070529220760693),
 (539, 0.007545466068934214),
 (100, 0.007304778715158644),
 (1768, 0.00628422684524688),
 (80, 0.005964560229150045),
 (457, 0.005740211883934308),
 (179, 0.005334095501829079),
 (379, 0.005069229772520852),
 (3610, 0.004942261393976307),
 (138, 0.004839616817892889),
 (165, 0.004187802662424581),
 (40, 0.004054447019408499),
 (137, 0.003572972566335439),
 (464, 0.0030697244628494166),
 (414, 0.00292197017325829),
 (27, 0.002841781270721087),


In [None]:
for obj in metadata_dict:
    if obj['node_id'] == 39264:
        print(obj['id'])

10.1056/nejmoa030781


In [None]:
comm = community.greedy_modularity_communities(citations_graph)

AttributeError: module 'community' has no attribute 'greedy_modularity_communities'

In [None]:
partition = community_louvain.best_partition(citations_graph)

TypeError: Bad graph type, use only non directed graph

In [None]:
nx.write_gml(citations_graph, "test.gml")

In [None]:
nx.draw(citations_graph)
plt.savefig('graph.png') 