In [1]:

from rdflib import Graph
from rdflib.extras.external_graph_libs import rdflib_to_networkx_digraph
import networkx as nx
from networkx import Graph as NXGraph
import matplotlib.pyplot as plt
import statistics
import collections

# RDF graph loading
# Parse the N-Triples file named by `filename` into the rdflib Graph `rg`.
# `rg` is the source for the networkx conversion and for the SPARQL queries
# in the cells below. The path is relative, so the file has to be reachable
# from the notebook's working directory.
filename = "cleaned_maingraph.nt"
rg = Graph()
rg.parse(filename, format='nt')
# len(rg) is reported as the number of triples that were loaded.
print("rdflib Graph loaded successfully with {} triples".format(len(rg)))

rdflib Graph loaded successfully with 3556103 triples


In [2]:
# Conversion of the rdflib Graph `rg` to a networkx directed graph.
# `G` holds the whole network and is what the "Analysis 1" cell measures.
G = rdflib_to_networkx_digraph(rg)
print("networkx Graph loaded successfully")

networkx Graph loaded successfully


In [3]:
# Analysis 1

def mean(numbers):
    """
    Arithmetic mean of ``numbers``, always returned as a float.

    The divisor is never allowed to drop below 1, so an empty sequence
    gives 0.0 instead of raising ZeroDivisionError.
    """
    total = float(sum(numbers))
    count = len(numbers)
    divisor = count if count > 1 else 1
    return total / divisor

def number_of_pendants(g):
    """
    Count the pendant nodes of ``g``, i.e. the nodes ``u`` for which
    ``g.degree[u]`` equals 1.
    """
    return sum(1 for node in g if g.degree[node] == 1)

# Network size: node and edge counts of the full graph G
print("NETWORK SIZE")
print("============")
print("The network has {} nodes and {} edges".format(G.number_of_nodes(), G.number_of_edges()))
print()

# Pendants: nodes of G whose degree is 1 (counted by number_of_pendants)
print("PENDANTS")
print("============")
print("The network has {} pendants".format(number_of_pendants(G))) # number of nodes with only one link
print()

# Density of G as computed by nx.density
print("DENSITY")
print("============")
print("The network density is {}".format(nx.density(G)))
print()

NETWORK SIZE
The network has 2297357 nodes and 3556074 edges

PENDANTS
The network has 1234725 pendants

DENSITY
The network density is 6.737736193187597e-07



In [6]:
# Subgraph construction
#
# The SPARQL query keeps the triples whose object passes both FILTER tests:
#   - its datatype is not xsd:double, and
#   - its string form does not match the regex "down|up" (case-insensitive).
# NOTE(review): datatype() is only defined for literals; under SPARQL FILTER
# semantics an evaluation error discards the solution, so triples whose object
# is an IRI or blank node are presumably dropped as well -- confirm that this
# is intended and not just the numeric / up-down values being filtered out.

query = """
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT ?s ?p ?o
WHERE {  
    ?s ?p ?o .
    FILTER (
        datatype(?o) != xsd:double
        && !REGEX(STR(?o), "down|up", "i")
    )
}
"""
# `subg` is the query result (rows of ?s ?p ?o); its length is printed first.
subg = rg.query(query)
print(len(subg))
# The result rows are handed directly to the rdflib -> networkx converter.
# `sg` is the directed subgraph analysed in the "Analysis 2" cell.
sg = rdflib_to_networkx_digraph(subg)
# len() of the converted graph (for a networkx graph this is the node count).
print(len(sg))
print("networkx SubGraph loaded successfully")


222492
352336
networkx SubGraph loaded successfully


In [None]:
# Analysis 2
import heapq

def mean(numbers):
    """
    Arithmetic mean of ``numbers``, always returned as a float.

    The divisor is never allowed to drop below 1, so an empty sequence
    gives 0.0 instead of raising ZeroDivisionError.
    (Same contract as the ``mean`` defined in the "Analysis 1" cell.)
    """
    total = float(sum(numbers))
    count = len(numbers)
    divisor = count if count > 1 else 1
    return total / divisor

def number_of_pendants(sg):
    """
    Count the pendant nodes of ``sg``, i.e. the nodes ``u`` for which
    ``sg.degree[u]`` equals 1.

    The parameter shadows the notebook-level ``sg``; only the graph passed
    in by the caller is inspected.
    """
    return sum(1 for node in sg if sg.degree[node] == 1)

# Network size: node and edge counts of the subgraph sg
print("NETWORK SIZE")
print("============")
print("The network has {} nodes and {} edges".format(sg.number_of_nodes(), sg.number_of_edges()))
print()

# Pendants: nodes of sg whose degree is 1 (counted by number_of_pendants)
print("PENDANTS")
print("============")
print("The network has {} pendants".format(number_of_pendants(sg))) # number of nodes with only one link
print()

# Density of sg as computed by nx.density
print("DENSITY")
print("============")
print("The network density is {}".format(nx.density(sg)))
print()

def histogram(l):
    """
    Draw a bar chart of how often each value occurs in ``l``.

    ``l`` is a mapping (for example node -> score); only its values are used.
    Every distinct value becomes one bar whose height is the number of keys
    holding that value. The distinct values and their counts are printed
    before the figure is shown.

    An empty mapping prints a short notice and returns without plotting.
    """
    values = sorted((d for _, d in l.items()), reverse=True)
    if not values:
        # zip(*...) over an empty Counter yields nothing to unpack into
        # (deg, cnt), so stop here instead of raising ValueError.
        print("Nothing to plot: no values given")
        return

    value_counts = collections.Counter(values)
    deg, cnt = zip(*value_counts.items())
    print(deg, cnt)

    fig, ax = plt.subplots()
    # Bars are centred on their value, so each tick is placed on the value
    # itself rather than 0.4 to its right.
    ax.bar(deg, cnt, width=0.80, color='b', align='center')

    ax.set_title("Histogram")
    ax.set_ylabel("Count")
    ax.set_xlabel("Value")
    ax.set_xticks(deg)
    ax.set_xticklabels(deg)

    plt.show()

def _report_centrality(heading, label, scores, values):
    """
    Print the summary lines for one centrality measure.

    heading -- section title; it is underlined with '=' of the same length
    label   -- measure name inserted into the "The mean ... is" sentence
    scores  -- dict mapping node -> centrality score
    values  -- list of the scores held in ``scores``; statistics.stdev
               raises StatisticsError when it has fewer than two entries
    """
    print(heading)
    print("=" * len(heading))
    print("The mean {} is {}, with stdev {}".format(label, mean(values), statistics.stdev(values)))
    # Lists the five highest-scoring nodes; the value shown is the single
    # highest score.
    print("The maximum nodes are {}, with value {}".format(heapq.nlargest(5, scores, key=scores.get), max(values)))
    print("The minimum node is {}, with value {}".format(min(scores, key=scores.get), min(values)))


# Degree centrality -- mean and stdev
dc = nx.degree_centrality(sg)
degrees = list(dc.values())

_report_centrality("DEGREE CENTRALITY", "degree centrality", dc, degrees) # number of connections a node has
#histogram(dc)
print()

# Eigenvector centrality -- mean and stdev
ec = nx.eigenvector_centrality(sg)
degrees = list(ec.values())

_report_centrality("EIGENVECTOR CENTRALITY", "network eigenvector centrality", ec, degrees)
#histogram(ec)
print()

# Betweenness centrality -- mean and stdev
# NOTE(review): this call is exact betweenness over the whole subgraph and is
# likely the slowest step of the cell; nx.betweenness_centrality also accepts
# k=<number of sampled nodes> for an estimate -- confirm whether that is
# acceptable before switching.
bc = nx.betweenness_centrality(sg)   # extent to which a node connects other nodes that are not otherwise connected.
degrees = list(bc.values())

_report_centrality("BETWEENNESS CENTRALITY", "betweenness centrality", bc, degrees)
#histogram(bc)
print()


# Connected components
# sg is a directed graph (it is built with rdflib_to_networkx_digraph) and
# nx.connected_components is not implemented for directed graphs, so the
# weakly connected components (edge direction ignored) are listed instead.
cc = list(nx.weakly_connected_components(sg))
print("CONNECTED COMPONENTS")
print("====================")
print("The graph has {} connected components".format(len(cc)))
for i,c in enumerate(cc):
    print("Connected component {} has {} nodes".format(i,len(c)))
print()

# Clusters
#cl = nx.clustering(sg)
#print("CLUSTERS")
#print("========")
#print("The graph has {} clusters".format(len(cl)))
#for i,c in enumerate(cl):
#    print("Cluster {} has {} nodes".format(i,len(c)))
#print()
    


In [65]:
#subgraph creation to demonstrate 2 papers:
from rdflib.extras.external_graph_libs import rdflib_to_networkx_digraph

# Three-hop walk starting at one article node:
#   ?root -> ?a -> ?c, then every triple (?c ?p ?o) that has ?c as subject.
# The projection uses ?c: the variable selected before (?ca) is never bound
# in the WHERE clause, so the subject column of every row came back unbound.
query = """
    PREFIX PMC: <https://pubmed.ncbi.nlm.nih.gov/>
    PREFIX EDAM: <http://edamontology.org/>
    PREFIX RDF: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX DCT: <http://purl.org/dc/terms/>
    PREFIX BIOLINK: <https://w3id.org/biolink/vocab/>
    PREFIX ENSEMBL: <http://identifiers.org/ensembl/>
    PREFIX NCBIGENE: <http://identifiers.org/ncbigene/>

    SELECT ?c ?p ?o
    WHERE {
        VALUES ?root {PMC:37704762}
        ?root ?q ?a .
        ?a ?b ?c .
        ?c ?p ?o
    }
    LIMIT 2000000
"""
# Row counts noted per article (one root at a time):
#PMID               Triples to article  +   Triples for rows of data    +   values for each row
#PMID:26186191  -   6 + 2203 + 6609
#PMID:28184278  -   6 + 39 + 117
#PMID:30545856  -   9 + 136353 + 408942
#PMID:31097668  -   11 + 4684 + 12981
#PMID:33243861  -   6 + 187 + 561
#PMID:33262327  -   7 + 2370 + 7110
#PMID:34535545  -   7 + 2597 + 7791
#PMID:35962193  -   9 + 93222 + 232603
#PMID:36323788  -   10 + 153575 + 460670
#PMID:36590292  -   6 + 248 + 744
#PMID:36862688  -   9 + 72374 + 217122
#PMID:37041460  -   13 + 3900 + 11700
#PMID:37704762  -   6 + 429307 + 1287921

# The number of rows does not depend on which variables are projected, so the
# printed count is the same as with the previous projection.
subg = rg.query(query)
print(len(subg))
#sg = rdflib_to_networkx_digraph(subg)
#print(len(sg))
#print("networkx SubGraph loaded successfully")

1287921


In [None]:
# Scratch note (SPARQL, not Python): the three-hop graph pattern from the
# article root, as used in the WHERE clause of the single-paper query above.
# Kept as comments so that running every cell does not try to execute it.
# ?root ?q ?a .
# ?a ?b ?c .
# ?c ?p ?o

In [6]:
import rdflib

# Collect everything reachable from the two article roots.
# `(!<>)*` follows zero or more predicates whose IRI is anything other than
# `<>`, so ?s ranges over the roots themselves and every node reachable from
# them; the second pattern then returns each triple that has such a node as
# its subject. A single `*` is used: wrapping the closure in another `*`
# describes the same set of nodes and only adds evaluation work.
query = """
PREFIX PMC: <https://pubmed.ncbi.nlm.nih.gov/>
PREFIX EDAM: <http://edamontology.org/>
PREFIX RDF: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX DCT: <http://purl.org/dc/terms/>
PREFIX BIOLINK: <https://w3id.org/biolink/vocab/>
PREFIX ENSEMBL: <http://identifiers.org/ensembl/>
PREFIX NCBIGENE: <http://identifiers.org/ncbigene/>

SELECT ?s ?p ?o
WHERE {
    VALUES ?root { PMC:32460837 PMC:33262327 }
    ?root (!<>)* ?s .
    ?s ?p ?o .
}
"""

# Execute the query
subg = rg.query(query)

# Create a new graph for the subgraph
subgraph = rdflib.Graph()

# Add the results to the subgraph (each result row is an (s, p, o) triple)
for row in subg:
    subgraph.add(row)

# Build the networkx view that is exported below. Nodes are the string forms
# of subject and object; the predicate is stored as the `relation` attribute.
# This rebinds the notebook-level name G, which an earlier cell assigns to the
# full-network graph: re-run that conversion cell before repeating the
# full-network analysis.
# NOTE(review): a DiGraph holds one edge per (subject, object) pair, so when
# several predicates link the same pair only the last `relation` written is
# presumably kept -- switch to nx.MultiDiGraph if every predicate must survive.
G = nx.DiGraph()
for s, p, o in subgraph:
    G.add_edge(str(s), str(o), relation=str(p))

# Written to the current working directory in GEXF format.
nx.write_gexf(G, "subgraph2.gexf")
