# WikiPathways SPARQL queries
This notebook can be used to rapidly perform SPARQL queries on a local Virtuoso SPARQL endpoint with WikiPathways RDF loaded.

## Imports

In [1]:
import sys

!{sys.executable} -m pip install --upgrade pip 
!{sys.executable} -m pip install watermark

try:
    from SPARQLWrapper import SPARQLWrapper, JSON
except ImportError:
    !{sys.executable} -m pip install sparqlwrapper
    from SPARQLWrapper import SPARQLWrapper, JSON

try:
    import pandas as pd
except ImportError:
    !{sys.executable} -m pip install pandas
    import pandas as pd

try:
    import time
except ImportError:
    !{sys.executable} -m pip install time
    import time
    
pd.set_option('display.max_colwidth', -1)

Requirement already up-to-date: pip in /home/marvin.martens/anaconda3/lib/python3.7/site-packages (20.2.3)


## Define the SPARQL endpoint URL

In [2]:
# Address of the SPARQL endpoint that every query in this notebook is sent to.
ENDPOINT_URL = "http://localhost:8890/sparql/"

# Shared client; results are requested as JSON so they can be read as dicts.
SPARQL = SPARQLWrapper(ENDPOINT_URL)
SPARQL.setReturnFormat(JSON)

## Run the queries
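The code cell below runs the SPARQL count queries (pathways, genes, publications, metabolites and interactions, all restricted to *Homo sapiens* pathways) and collects the resulting counts in a pandas DataFrame.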

In [3]:
start = time.time()


def fetch_counts(query, variable):
    """Run ``query`` on the shared ``SPARQL`` client and collect one result variable.

    Parameters
    ----------
    query : str
        Text of the SPARQL SELECT query to execute.
    variable : str
        Name of the result variable to read (without the leading ``?``).

    Returns
    -------
    list
        The ``"value"`` entry of ``variable`` for every result binding, in the
        order the endpoint returned them (empty if there are no bindings).
    """
    SPARQL.setQuery(query)
    response = SPARQL.query().convert()
    return [binding[variable]["value"] for binding in response["results"]["bindings"]]


def fetch_single_count(query, variable):
    """Return the count bound to ``variable`` by ``query`` as an ``int``.

    Raises
    ------
    ValueError
        If the query yields no bindings. Failing loudly here prevents a
        missing result from being replaced by a value left over from an
        earlier query.
    """
    values = fetch_counts(query, variable)
    if not values:
        raise ValueError("SPARQL query returned no binding for ?" + variable)
    # As before, the last binding wins if the endpoint returns several.
    return int(values[-1])


# Rows are gathered in a plain list and turned into a DataFrame once at the
# end; DataFrame.append is deprecated and no longer exists in pandas >= 2.0.
records = []


def add_counts(item, variable, query):
    """Append one ``{'item', 'number'}`` record per binding returned by ``query``."""
    records.extend(
        {'item': item, 'number': value} for value in fetch_counts(query, variable)
    )


def add_metabolite_count(item, query_all, query_without_ids):
    """Append the result of ``query_all`` minus the result of ``query_without_ids``.

    Both queries must bind ``?metaboliteCount``. The second query counts the
    metabolites for which none of the Wikidata, ChEBI, HMDB, ChemSpider or
    PubChem identifiers is bound, so the difference is the number of
    metabolites carrying at least one of those identifiers.
    """
    total = fetch_single_count(query_all, "metaboliteCount")
    without_ids = fetch_single_count(query_without_ids, "metaboliteCount")
    # Stored as a string, like the values read directly from the endpoint.
    records.append({'item': item, 'number': str(total - without_ids)})


#Pathways

add_counts('allPathways', "pathwayCount", '''SELECT DISTINCT count(distinct ?pathway) as ?pathwayCount
WHERE {
  ?pathway a wp:Pathway;
  dc:title ?title ;
  wp:organism ?organism ;
  wp:organismName "Homo sapiens"^^xsd:string .
}''')


#Genes - ALL

add_counts('allGenes', "Ensembl", '''SELECT DISTINCT count(?ensId) as ?Ensembl 
WHERE {
  ?dataNodes a wp:DataNode ;
  wp:bdbEnsembl ?ensId ;
  dcterms:isPartOf ?pw.
  ?pw a wp:Pathway;
  dc:title ?title ;
  wp:organism ?organism ;
  wp:organismName "Homo sapiens"^^xsd:string .
}''')

#Genes - Unique

add_counts('UniqueGenes', "Ensembl", '''SELECT DISTINCT count(distinct ?ensId) as ?Ensembl 
WHERE {
  ?dataNodes a wp:DataNode ;
  wp:bdbEnsembl ?ensId ;
  dcterms:isPartOf ?pw.
  ?pw a wp:Pathway;
  dc:title ?title ;
  wp:organism ?organism ;
  wp:organismName "Homo sapiens"^^xsd:string .
}''')


#PubMed - ALL

add_counts('allPublications', "pubmedCount", '''SELECT  count(?pubmed) as ?pubmedCount
WHERE {
  ?pubmed a wp:PublicationReference ;
  dcterms:isPartOf ?pathway.
  ?pathway a wp:Pathway;
  wp:organism ?organism ;
  wp:organismName "Homo sapiens"^^xsd:string .
}''')


#PubMed - Unique

add_counts('UniquePublications', "pubmedCount", '''SELECT count(distinct ?pubmed) as ?pubmedCount 
WHERE {
  ?pubmed a wp:PublicationReference ;
  dcterms:isPartOf ?pathway.
  ?pathway a wp:Pathway;
  wp:organism ?organism ;
  wp:organismName "Homo sapiens"^^xsd:string .
}''')


#Metabolites - ALL

add_metabolite_count(
    'allMetabolites',
    '''SELECT count(?metabolite) as ?metaboliteCount 
WHERE {
  ?metabolite a wp:Metabolite ;
  dcterms:isPartOf ?pw.
  ?pw a wp:Pathway;
  dc:title ?title ;
  wp:organism ?organism ;
  wp:organismName "Homo sapiens"^^xsd:string .
}''',
    '''SELECT count(?metabolite) as ?metaboliteCount 
WHERE {
  ?metabolite a wp:Metabolite ;
  dcterms:isPartOf ?pw.
  ?pw a wp:Pathway;
  dc:title ?title ;
  wp:organism ?organism ;
  wp:organismName "Homo sapiens"^^xsd:string .
  
  OPTIONAL { ?metabolite wp:bdbWikidata ?wikidata . } 
  OPTIONAL { ?metabolite wp:bdbChEBI ?chebi . }  
  OPTIONAL { ?metabolite wp:bdbHmdb ?hmdb . }  
  OPTIONAL { ?metabolite wp:bdbChemspider ?chemspider . }  
  OPTIONAL { ?metabolite wp:bdbPubChem ?pubchem . }  

  FILTER (!BOUND(?wikidata))  
  FILTER (!BOUND(?chebi))  
  FILTER (!BOUND(?hmdb))  
  FILTER (!BOUND(?chemspider)) 
  FILTER (!BOUND(?pubchem)) 
}''',
)


#Metabolites - Unique

add_metabolite_count(
    'UniqueMetabolites',
    '''SELECT count(distinct ?metabolite) as ?metaboliteCount  
WHERE {
  ?metabolite a wp:Metabolite ;
  dcterms:isPartOf ?pw.
  ?pw a wp:Pathway;
  dc:title ?title ;
  wp:organism ?organism ;
  wp:organismName "Homo sapiens"^^xsd:string .
}''',
    '''SELECT count(distinct ?metabolite) as ?metaboliteCount  
WHERE {
  ?metabolite a wp:Metabolite ;
  dcterms:isPartOf ?pw.
  ?pw a wp:Pathway;
  dc:title ?title ;
  wp:organism ?organism ;
  wp:organismName "Homo sapiens"^^xsd:string .
  
  OPTIONAL { ?metabolite wp:bdbWikidata ?wikidata . }  
  OPTIONAL { ?metabolite wp:bdbChEBI ?chebi . }  
  OPTIONAL { ?metabolite wp:bdbHmdb ?hmdb . }  
  OPTIONAL { ?metabolite wp:bdbChemspider ?chemspider . }  
  OPTIONAL { ?metabolite wp:bdbPubChem ?pubchem . } 

  FILTER (!BOUND(?wikidata))  
  FILTER (!BOUND(?chebi)) 
  FILTER (!BOUND(?hmdb))  
  FILTER (!BOUND(?chemspider))
  FILTER (!BOUND(?pubchem)) 
}''',
)


#Interactions

add_counts('allInteractions', "countInteraction", '''SELECT count( ?interaction) as ?countInteraction
WHERE {
  ?interaction a wp:Interaction ;
  dcterms:isPartOf ?pathway.
  ?pathway a wp:Pathway;
  wp:organism ?organism ;
  wp:organismName "Homo sapiens"^^xsd:string .
} ''')


# Build the summary table in one step; the explicit column list fixes the
# column order and also yields the right (empty) frame if nothing was collected.
data = pd.DataFrame(records, columns=['item', 'number'])

display(data)
end = time.time()
print("Code executed in " + str(end - start) + " seconds")

Unnamed: 0,item,number
0,allPathways,853
1,allGenes,44274
2,UniqueGenes,11523
3,allPublications,28500
4,UniquePublications,20097
5,allMetabolites,8948
6,UniqueMetabolites,3368
7,allInteractions,40699


Code executed in 0.3586742877960205 seconds


## Metadata for executing this Jupyter notebook

In [5]:
# Load the watermark IPython extension (installed in the imports cell) so the
# %watermark magic below is available.
%load_ext watermark

# Report the Python and IPython versions (-v), machine characteristics (-m)
# and the versions of the listed packages (-p). Modules that expose no
# version (e.g. the standard-library `time`) are reported as "unknown".
%watermark -v -m -p sys,pip,SPARQLWrapper,pandas,time

# Date, time and time zone of this run, separated from the block above by a
# blank-looking line.
print(" ")
%watermark -u -n -t -z

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
CPython 3.7.3
IPython 7.6.1

sys 3.7.3 (default, Mar 27 2019, 22:11:17) 
[GCC 7.3.0]
pip 20.2.3
SPARQLWrapper 1.8.2
pandas 0.24.2
time unknown

compiler   : GCC 7.3.0
system     : Linux
release    : 5.4.0-47-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 8
interpreter: 64bit
 
last updated: Fri Oct 09 2020 11:39:15 CEST
