General analysis on the knowledge graph, and prep for visualising in Gephi

In [1]:
#loading graph into RDFLib

from rdflib import Graph
from rdflib.extras.external_graph_libs import rdflib_to_networkx_digraph
import networkx as nx
from networkx import Graph as NXGraph
import matplotlib.pyplot as plt
import statistics
import collections


# Parse the cleaned N-Triples dump into an in-memory rdflib graph (`rg`),
# then report how many triples were read.
filename = "cleaned_maingraph.nt"
rg = Graph()
rg.parse(source=filename, format="nt")
print("rdflib Graph loaded successfully with {} triples".format(len(rg)))

rdflib Graph loaded successfully with 3553957 triples


In [2]:
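# Conversion of rdflib.Graph to a networkx.DiGraph (currently disabled).
# Uncomment the two lines below to define G, which the "Analysis 1" cell needs.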
# G = rdflib_to_networkx_digraph(rg)
# print("networkx Graph loaded successfully")

networkx Graph loaded successfully


In [None]:
# Analysis 1

def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    The divisor is clamped to a minimum of 1, so an empty collection
    yields ``0.0`` instead of raising ``ZeroDivisionError``.
    ``numbers`` must work with both ``sum()`` and ``len()``.
    """
    total = float(sum(numbers))
    count = len(numbers)
    return total / (count if count >= 1 else 1)

def number_of_pendants(G):
    """
    Count the pendant nodes of ``G``: the nodes whose degree is exactly 1.

    ``G`` must be iterable over its nodes and expose ``G.degree[node]``.
    Returns 0 when ``G`` has no nodes.
    """
    return sum(1 for node in G if G.degree[node] == 1)

# Network summary: node/edge counts, pendant count and density.
# Each line is followed by a blank line. Requires `G` to have been built
# by an earlier cell.
print(
    "The network has {} nodes and {} edges".format(G.number_of_nodes(), G.number_of_edges()),
    end="\n\n",
)
# Pendants are the nodes with only one link.
print("The network has {} pendants".format(number_of_pendants(G)), end="\n\n")
print("The network density is {}".format(nx.density(G)), end="\n\n")

In [5]:
# Query to check the size of the layers below the selected articles.
# The three UNION branches return, for every ?root listed in VALUES:
#   layer 1 - triples whose subject is the article (?root) itself
#   layer 2 - triples whose subject is an object of a layer-1 triple
#   layer 3 - triples whose subject is an object of a layer-2 triple
# Only the combined row count over all listed roots is printed below.
# NOTE(review): there is no DISTINCT and the LIMIT has no ORDER BY, so the
# printed count is capped at 20000 rows - confirm it stays below that cap.
from rdflib.extras.external_graph_libs import rdflib_to_networkx_digraph

query = """
    PREFIX PMC: <https://pubmed.ncbi.nlm.nih.gov/>
    PREFIX EDAM: <http://edamontology.org/>
    PREFIX RDF: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX DCT: <http://purl.org/dc/terms/>
    PREFIX BIOLINK: <https://w3id.org/biolink/vocab/>
    PREFIX ENSEMBL: <http://identifiers.org/ensembl/>
    PREFIX NCBIGENE: <http://identifiers.org/ncbigene/>
                
    SELECT ?s ?p ?o
    WHERE {
        VALUES ?root {PMC:28184278 PMC:36590292 PMC:33243861 PMC:26186191}
        {
            ?root ?d ?a .
            BIND(?root AS ?s)
            BIND(?d AS ?p)
            BIND(?a AS ?o)
        }
        UNION
        {   
            ?root ?d ?a .
            ?a ?e ?b .
            BIND(?a AS ?s)
            BIND(?e AS ?p)
            BIND(?b AS ?o)
        }
        UNION
        {
            ?root ?d ?a .
            ?a ?e ?b .
            ?b ?f ?c .
            BIND(?b AS ?s)
            BIND(?f AS ?p)
            BIND(?c AS ?o)
       }
    }
    LIMIT 20000
"""

# Run the query against the loaded rdflib graph and print the number of result rows
subg = rg.query(query)
print(len(subg))

10732


In [None]:
#Actual triple sizes per article (retrieved from above):
#PMID               Triples to article  +   Triples for rows of data    +   values for each row
#PMID:26186191  -   6 + 2203 + 6609
#PMID:28184278  -   6 + 39 + 117
#PMID:30545856  -   9 + 136353 + 408942
#PMID:31097668  -   11 + 4684 + 12981
#PMID:33243861  -   6 + 187 + 561
#PMID:33262327  -   7 + 2370 + 7110
#PMID:34535545  -   7 + 2597 + 7791
#PMID:35962193  -   9 + 93222 + 232603
#PMID:36323788  -   10 + 153575 + 460670
#PMID:36590292  -   6 + 248 + 744
#PMID:36862688  -   9 + 72374 + 217122
#PMID:37041460  -   13 + 3900 + 11700
#PMID:37704762  -   6 + 429307 + 1287921

In [6]:
#To convert to nxgraph (not currently needed)
#G = nx.DiGraph()
#for s, p, o in subg:
#    G.add_edge(str(s), str(o), relation=str(p))
#
#nx.write_gexf(G, "sub4.gexf")

In [6]:
# Subgraph creation to visualise 2 papers - creates a Gephi (GEXF) file.
# The `rdflib` module name itself is not imported in this notebook, so the
# subgraph is built with the `Graph` class imported from rdflib in the first cell.

# `(!<>)*` follows any predicate zero or more times, so ?s ranges over each
# root and everything reachable from it; every triple with such an ?s is returned.
# NOTE(review): the outer `*` in `((!<>)*)*` looks redundant - confirm before simplifying.
query = """
PREFIX PMC: <https://pubmed.ncbi.nlm.nih.gov/>
PREFIX EDAM: <http://edamontology.org/>
PREFIX RDF: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX DCT: <http://purl.org/dc/terms/>
PREFIX BIOLINK: <https://w3id.org/biolink/vocab/>
PREFIX ENSEMBL: <http://identifiers.org/ensembl/>
PREFIX NCBIGENE: <http://identifiers.org/ncbigene/>
            
SELECT ?s ?p ?o
WHERE {
    VALUES ?root { PMC:32460837 PMC:33262327 }
    ?root ((!<>)*)* ?s .
    ?s ?p ?o .
}
"""

# Copy the (s, p, o) result rows into their own rdflib graph.
subg = rg.query(query)
subgraph = Graph()
for row in subg:
    subgraph.add(row)

# Build a directed networkx graph: subject -> object, predicate kept as the
# `relation` edge attribute. A DiGraph holds one edge per (subject, object)
# pair, so a later predicate between the same pair overwrites `relation`.
G = nx.DiGraph()
for s, p, o in subgraph:
    G.add_edge(str(s), str(o), relation=str(p))

# Write out to a gexf file that can be viewed and manipulated in Gephi
nx.write_gexf(G, "subgraph2.gexf")